library(tidyverse)
library(plotly)
library(sf)
library(mapview)
library(tigris)
library(censusapi)
library(leaflet)
library(lehdr)
library(usmap)
library(lmtest)
library(pracma)
library(lmtest)
library(forecast)
library(vars)
library(rvest)
library(RSelenium)
library(seleniumPipes)
library(dLagM)
library(jsonlite)
library(rgdal)
library(esri2sf)
library(readr)

options(
  tigris_class = "sf",
  tigris_use_cache = TRUE
)

Sys.setenv(CENSUS_KEY="10dcd73d7c043e91bac9fb8d3989cbff54b08790")

# Load the processing functions. This also loads the normalization functions.
source("safegraph_process_patterns_functions.R")

# SET your path here to the covid19analysis folder.
sg_path <- '/Users/simonespeizer/pCloud Drive/Shared/SFBI/Restricted Data Library/Safegraph/covid19analysis/'

# SET your path here to the github covid folder
github_path  <- ''

Zip code level analysis

Santa Clara County and San Francisco zip code cases

scc_map_cases <- esri2sf("https://services2.arcgis.com/RiZWfy7B1r76pKTz/arcgis/rest/services/COVID_zip_FC/FeatureServer/0")
## [1] "Feature Layer"
## [1] "esriGeometryPolygon"
sf_place_cases <- 
  read_csv("https://raw.githubusercontent.com/datadesk/california-coronavirus-data/master/latimes-place-totals.csv") %>%
  filter(county == 'San Francisco')

Get social distancing data

scc_blockgroups <-
  block_groups("CA","Santa Clara", cb=F, progress_bar=F) %>% 
  st_transform('+proj=longlat +datum=WGS84')

sf_blockgroups <- 
  block_groups("CA","San Francisco", cb=F, progress_bar=F) %>% 
  st_transform('+proj=longlat +datum=WGS84')

# zip code areas
zctas_94 <- 
  zctas(cb = F, starts_with = "94") %>% 
  st_transform('+proj=longlat +datum=WGS84') %>%
  dplyr::select(ZCTA5CE10, geometry)

zctas_95 <- 
  zctas(cb = F, starts_with = "95") %>% 
  st_transform('+proj=longlat +datum=WGS84') %>%
  dplyr::select(ZCTA5CE10, geometry)

zctas_94_95 <- rbind(zctas_94, zctas_95)

# join with block groups
scc_bg_zctas <- scc_blockgroups %>% 
  st_centroid() %>%
  st_join(zctas_94_95) %>% 
  dplyr::select(GEOID, ZCTA5CE10) %>%
  rename(blockgroup = GEOID, zipcode = ZCTA5CE10)

sf_bg_zctas <- sf_blockgroups %>% 
  st_centroid() %>%
  st_join(zctas_94) %>% 
  dplyr::select(GEOID, ZCTA5CE10) %>%
  rename(blockgroup = GEOID, zipcode = ZCTA5CE10)

bay_sd <- readRDS("/Users/simonespeizer/pCloud Drive/Shared/SFBI/Restricted Data Library/Safegraph/covid19analysis/bay_socialdistancing_v2.rds") %>% 
  mutate(date = date_range_start %>%  substr(1,10) %>% as.Date())

# obtaining weekends
weekends <- bay_sd %>% 
  filter(!duplicated(date)) %>% 
  arrange(date) %>% 
  mutate(weekend = ifelse((date %>% as.numeric()) %% 7 %in% c(2,3), T, F)) %>% 
  dplyr::select(date,weekend)

bay_sd <- bay_sd %>% left_join(weekends)

SF zip code over time analysis

Over time cases by zip code

First just plot the cases over time by zip code

# obtain the saved census data 
acs_vars = readRDS("/Users/simonespeizer/Documents/2020 Spring Quarter/CEE 218Z/censusData2018_acs_acs5.rds")

# define a function for pulling census data
pullCensus <- function(variableToPull, county) {
    regionString <- paste0("state:06+county:", county)
    censusData <- getCensus(
      name = "acs/acs5",
      vintage = 2018,
      region = "block group:*", 
      regionin = regionString,
      vars = variableToPull
    ) %>%
    mutate(blockgroup = paste0(state,county,tract,block_group)) %>%
      select_if(!names(.) %in% c("GEO_ID","state","county","tract","block_group","NAME"))
  
  return(censusData)
}


# get population data for San Francisco
sf_fips <- fips("CA", "San Francisco") %>% substr(3,5)

sf_pop_zip <- pullCensus("B01003_001E", sf_fips) %>% 
  left_join(sf_bg_zctas %>% dplyr::select(blockgroup, zipcode)) %>% 
  group_by(zipcode) %>%
  summarize(total_pop_zip = sum(B01003_001E))

sf_cases_zip <- sf_place_cases %>% left_join(sf_pop_zip, by = c("place" = "zipcode")) %>%
  mutate(cases_by_pop = confirmed_cases / total_pop_zip) %>%
  rename(zipcode = place)

sf_cases_zip %>% ggplot(
  aes(x = date)) +
    geom_line(aes(y = cases_by_pop, color=zipcode))

Something weird is going on with case counts on May 7th and 11th, so I remove those dates manually. Also adjust the day in zip code 94117 that is off.

sf_cases_zip_edit <- sf_cases_zip %>% filter(date != "2020-05-07" & date != "2020-05-11") %>% mutate(confirmed_cases = ifelse(zipcode == "94117" & confirmed_cases == 1, confirmed_cases + 37, confirmed_cases), cases_by_pop = confirmed_cases / total_pop_zip)

sf_cases_zip_edit %>% ggplot(
  aes(x = date)) +
    geom_line(aes(y = cases_by_pop, color=zipcode))

sf_cases_zip_edit %>% ggplot(
  aes(x = date)) +
    geom_line(aes(y = confirmed_cases, color=zipcode))

Bucket by movement right before shelter in place

Get social distancing data by zip code for the first few days before shelter-in-place

# relevant dates
pre_sd_date_cutoff <- as.Date("2020-03-01")
shelter_date <- as.Date("2020-03-17") # note that this is the day the order went into effect

# get daily social distancing data for SF by date
sf_sd_zip_by_date <- bay_sd %>%
  filter(origin_census_block_group %in% sf_blockgroups$GEOID) %>%
  left_join(sf_bg_zctas %>% dplyr::select(blockgroup, zipcode), by = c("origin_census_block_group" = "blockgroup")) %>%
  group_by(zipcode, date) %>%
  summarize(total_at_home_zip = sum(completely_home_device_count), total_devices = sum(device_count)) %>%
  ungroup()

# get block group averages leaving home before shelter in place/COVID-19
sf_pre_sd_leaving_avg <- bay_sd %>%
  filter(origin_census_block_group %in% sf_blockgroups$GEOID) %>%
  filter(date <= pre_sd_date_cutoff)  %>%
  left_join(sf_bg_zctas %>% dplyr::select(blockgroup, zipcode), by = c("origin_census_block_group" = "blockgroup")) %>%
  group_by(zipcode) %>%
  summarize(total_at_home_zip = sum(completely_home_device_count), total_devices = sum(device_count)) %>%
  mutate(
    percent_at_home_pre = total_at_home_zip*100/total_devices,
    percent_leaving_home_pre = (100 - percent_at_home_pre)
  )

# get average percent leaving the home for the 3 days before shelter in place
sf_sd_zip_3_days_pre <- sf_sd_zip_by_date %>%
  filter(date < shelter_date & date >= (shelter_date - 3)) %>%
  group_by(zipcode) %>%
  summarize(total_at_home_zip = sum(total_at_home_zip), total_devices = sum(total_devices)) %>% 
  mutate(percent_at_home = total_at_home_zip*100/total_devices, percent_leaving_home = (100 - percent_at_home)) %>% 
  left_join(sf_pre_sd_leaving_avg %>% dplyr::select(zipcode, percent_leaving_home_pre)) %>%
  mutate(dif_percent_leaving = percent_leaving_home - percent_leaving_home_pre)
  
sf_sd_zip_3_days_pre_buckets <- sf_sd_zip_3_days_pre %>% 
  mutate(bucketed_percent_leaving = ifelse(percent_leaving_home < 65, "65% or below", ifelse(percent_leaving_home < 70, "65% to 70%", "70% or more")), bucketed_dif_percent_leaving = ifelse(dif_percent_leaving > -5, "-5% or above", ifelse(dif_percent_leaving > -10, "-5% to -10%", ifelse(dif_percent_leaving > -15, "-10% to -15%", "-15% or below"))))

# join with the case data
sf_cases_sd_3dayspre_bucketed_abs <- left_join(sf_cases_zip_edit, sf_sd_zip_3_days_pre_buckets) %>%
  group_by(date, bucketed_percent_leaving) %>%
  summarize(total_cases_grouped = sum(confirmed_cases), total_pop_combined = sum(total_pop_zip)) %>%
  mutate(cases_by_pop_grouped = total_cases_grouped / total_pop_combined)
  
# plot
sf_cases_sd_3dayspre_bucketed_abs %>% filter(!is.na(bucketed_percent_leaving)) %>% ggplot(aes(x = date)) + 
  geom_line(aes(y = cases_by_pop_grouped, color=bucketed_percent_leaving)) + 
  ggtitle('Cases per capita for zipcodes grouped by % leaving in the 3 days before SIP') + labs(y = "Cases per capita", x = "Date", color = "% leaving home")

Pretty much the opposite of what we expect. But note that the differences between the groups do not seem to be growing much in the time period we see here - maybe there were pre-existing differences between these groups in terms of their baseline case counts?

Now look at the difference in percent leaving relative to pre SIP.

# try with differences in percent leaving
sf_cases_sd_3dayspre_bucketed_rel <- left_join(sf_cases_zip_edit, sf_sd_zip_3_days_pre_buckets) %>%
  group_by(date, bucketed_dif_percent_leaving) %>%
  summarize(total_cases_grouped = sum(confirmed_cases), total_pop_combined = sum(total_pop_zip)) %>%
  mutate(cases_by_pop_grouped = total_cases_grouped / total_pop_combined)
  
# plot
sf_cases_sd_3dayspre_bucketed_rel %>% filter(!is.na(bucketed_dif_percent_leaving)) %>% ggplot(aes(x = date)) + geom_line(aes(y = cases_by_pop_grouped, color=bucketed_dif_percent_leaving)) + ggtitle('Cases per capita for zipcodes grouped by % leaving in the 3 days before SIP') + labs(y = "Cases per capita", x = "Date", color = "dif in % leaving home relative to pre SIP")

This actually looks more like what we would expect, except for the -5% or greater bucket, which do note is only composed of 3 zip codes.

Bucket by movement right after shelter in place

Try now with the 3 days after SIP.

# get average percent leaving the home for the 3 days afterter in place
sf_sd_zip_3_days_post <- sf_sd_zip_by_date %>%
  filter(date >= shelter_date & date < (shelter_date + 3)) %>%
  group_by(zipcode) %>%
  summarize(total_at_home_zip = sum(total_at_home_zip), total_devices = sum(total_devices)) %>% 
  mutate(percent_at_home = total_at_home_zip*100/total_devices, percent_leaving_home = (100 - percent_at_home)) %>% 
  left_join(sf_pre_sd_leaving_avg %>% dplyr::select(zipcode, percent_leaving_home_pre)) %>%
  mutate(dif_percent_leaving = percent_leaving_home - percent_leaving_home_pre)
  

sf_sd_zip_3_days_post %>% ggplot(aes(x = zipcode, y = percent_leaving_home)) + geom_point()

sf_sd_zip_3_days_post %>% ggplot(aes(x = zipcode, y = dif_percent_leaving)) + geom_point()

sf_sd_zip_3_days_post_buckets <- sf_sd_zip_3_days_post %>% 
  mutate(bucketed_percent_leaving = ifelse(percent_leaving_home < 55, "55% or below", ifelse(percent_leaving_home < 60, "55% to 60%", ifelse(percent_leaving_home < 65, "60% to 65%","65% or more"))), bucketed_dif_percent_leaving = ifelse(dif_percent_leaving > -10, "-10% or above", ifelse(dif_percent_leaving > -15, "-10% to -15%", ifelse(dif_percent_leaving > -20, "-15% to -20%", ifelse(dif_percent_leaving > -25, "-20% to -25%", "-25% or below")))))

# join with the case data
sf_cases_sd_3dayspost_bucketed_abs <- left_join(sf_cases_zip_edit, sf_sd_zip_3_days_post_buckets) %>%
  group_by(date, bucketed_percent_leaving) %>%
  summarize(total_cases_grouped = sum(confirmed_cases), total_pop_combined = sum(total_pop_zip)) %>%
  mutate(cases_by_pop_grouped = total_cases_grouped / total_pop_combined)
  
# plot
sf_cases_sd_3dayspost_bucketed_abs %>% filter(!is.na(bucketed_percent_leaving)) %>% ggplot(aes(x = date)) + geom_line(aes(y = cases_by_pop_grouped, color=bucketed_percent_leaving)) + ggtitle('Cases per capita for zipcodes grouped by % leaving in the 3 days on and after SIP') + labs(y = "Cases per capita", x = "Date", color = "% leaving home")

Again pretty much the opposite of expectations.

# try with differences in percent leaving
sf_cases_sd_3dayspost_bucketed_rel <- left_join(sf_cases_zip_edit, sf_sd_zip_3_days_post_buckets) %>%
  group_by(date, bucketed_dif_percent_leaving) %>%
  summarize(total_cases_grouped = sum(confirmed_cases), total_pop_combined = sum(total_pop_zip)) %>%
  mutate(cases_by_pop_grouped = total_cases_grouped / total_pop_combined)
  
# plot
sf_cases_sd_3dayspost_bucketed_rel %>% filter(!is.na(bucketed_dif_percent_leaving)) %>% ggplot(aes(x = date)) + geom_line(aes(y = cases_by_pop_grouped, color=bucketed_dif_percent_leaving)) + ggtitle('Cases per capita for zipcodes grouped by % leaving in the 3 days on and after SIP') + labs(y = "Cases per capita", x = "Date", color = "dif in % leaving home relative to pre SIP")

Slightly more as expected, but still weird.

Bucket by movement through April 20th

Try the whole time from SIP through April 20th, the first day that we have case data

# get average percent leaving the home for the 3 days after shelter in place
sf_sd_zip_till420 <- sf_sd_zip_by_date %>%
  filter(date >= shelter_date & date < ("2020-04-20")) %>%
  group_by(zipcode) %>%
  summarize(total_at_home_zip = sum(total_at_home_zip), total_devices = sum(total_devices)) %>% 
  mutate(percent_at_home = total_at_home_zip*100/total_devices, percent_leaving_home = (100 - percent_at_home)) %>% 
  left_join(sf_pre_sd_leaving_avg %>% dplyr::select(zipcode, percent_leaving_home_pre)) %>%
  mutate(dif_percent_leaving = percent_leaving_home - percent_leaving_home_pre)
  

sf_sd_zip_till420 %>% ggplot(aes(x = zipcode, y = percent_leaving_home)) + geom_point()

sf_sd_zip_till420 %>% ggplot(aes(x = zipcode, y = dif_percent_leaving)) + geom_point()

sf_sd_zip_till420_buckets <- sf_sd_zip_3_days_post %>% 
  mutate(bucketed_percent_leaving = ifelse(percent_leaving_home < 50, "50% or below", ifelse(percent_leaving_home < 55, "50% to 55%", ifelse(percent_leaving_home < 60, "55% to 60%", "60% or more"))), bucketed_dif_percent_leaving = ifelse(dif_percent_leaving > -15, "-15% or above", ifelse(dif_percent_leaving > -20, "-15% to -20%", ifelse(dif_percent_leaving > -25, "-20% to -25%", "-25% or below"))))

# join with the case data
sf_cases_sd_till420_bucketed_abs <- left_join(sf_cases_zip_edit, sf_sd_zip_till420_buckets) %>%
  group_by(date, bucketed_percent_leaving) %>%
  summarize(total_cases_grouped = sum(confirmed_cases), total_pop_combined = sum(total_pop_zip)) %>%
  mutate(cases_by_pop_grouped = total_cases_grouped / total_pop_combined)
  
# plot
sf_cases_sd_till420_bucketed_abs %>% filter(!is.na(bucketed_percent_leaving)) %>% ggplot(aes(x = date)) + geom_line(aes(y = cases_by_pop_grouped, color=bucketed_percent_leaving)) + ggtitle('Cases per capita for zipcodes grouped by % leaving since SIP') + labs(y = "Cases per capita", x = "Date", color = "% leaving home")

Same surprising trend, but again the differences between the top two groups don’t seem to grow a ton (though they do relative to the bottom one).

Let’s look at this in scatter plot form. In the plot below each point represents a date, and again we’re looking at bucketed zip codes by their movement.

# scatter plot 
sf_cases_sd_till420_bucketed_abs %>% filter(!is.na(bucketed_percent_leaving)) %>% ggplot(aes(x = bucketed_percent_leaving, y = cases_by_pop_grouped)) + geom_point() + ggtitle('Cases by population through current date, grouped by movement, SF')

Again look at differences in percent leaving.

# try with differences in percent leaving
sf_cases_sd_till420_bucketed_rel <- left_join(sf_cases_zip_edit, sf_sd_zip_till420_buckets) %>%
  group_by(date, bucketed_dif_percent_leaving) %>%
  summarize(total_cases_grouped = sum(confirmed_cases), total_pop_combined = sum(total_pop_zip)) %>%
  mutate(cases_by_pop_grouped = total_cases_grouped / total_pop_combined)
  
# plot
sf_cases_sd_till420_bucketed_rel %>% filter(!is.na(bucketed_dif_percent_leaving)) %>% ggplot(aes(x = date)) + geom_line(aes(y = cases_by_pop_grouped, color=bucketed_dif_percent_leaving)) + ggtitle('Cases per capita for zipcodes grouped by % leaving since SIP') + labs(y = "Cases per capita", x = "Date", color = "dif in % leaving home relative to pre SIP")

Except for the -15% or above, more as we’d expect.

Bucket by case growth and look at movement

Looking back at the original case growth over time by zip codes, it looks like there are generally two groups–one set of zip codes that have >0.003 cases per capita, and one set that have <0.002 cases per capita by May 18th. The first set have rapid growths, while those in the second set generally have less if any growth. Let’s try breaking it up by cases.

sf_cases_zip_bucketed <- sf_cases_zip_edit %>% 
  filter(date == max(date)) %>%
  mutate(bucketed_case_level = ifelse(cases_by_pop <=0.002, "< 0.002", "> 0.002"))

sf_sd_zip_cases_bucket <- left_join(sf_cases_zip_bucketed %>% dplyr::select(-date), sf_sd_zip_by_date)

sf_sd_zip_cases_bucket_movement <- sf_sd_zip_cases_bucket %>% filter(!is.na(bucketed_case_level)) %>%
  group_by(bucketed_case_level, date) %>%
  summarize(total_at_home_bucket = sum(total_at_home_zip), total_devices = sum(total_devices)) %>% 
  mutate(percent_at_home = total_at_home_bucket*100/total_devices, percent_leaving_home = (100 - percent_at_home))

sf_sd_zip_cases_bucket_movement %>% filter(!is.na(bucketed_case_level)) %>%  ggplot(aes(x = date)) + 
  geom_line(aes(y = percent_leaving_home, color=bucketed_case_level)) + ggtitle('% leaving home over time grouped by cases per capita') + labs(y = "% leaving home", x = "Date", color = "cases per capita by May 18th")

Same figure but with plot_ly for more interactivity.

fig_sf_movt_case_bucket <- sf_sd_zip_cases_bucket_movement %>% filter(!is.na(bucketed_case_level)) %>% 
  plot_ly() %>%
  add_lines(x = ~date, y = ~percent_leaving_home, color = ~bucketed_case_level) %>%
  layout(xaxis = list(title = 'Date'), yaxis = list(title = '% leaving home'), legend = list(title = list(text = "Cases per capita by May 18th")), title = "% leaving home over time grouped by cases per capita")

fig_sf_movt_case_bucket

Also pretty weird. Try looking just at after shelter in place, and the difference relative to before.

# try looking just after shelter in place
# get pre shelter in place averages
sf_sd_zip_cases_bucket_movement_pre_avg <- sf_sd_zip_cases_bucket_movement %>% filter(date < pre_sd_date_cutoff) %>% ungroup() %>%
  group_by(bucketed_case_level) %>%
  summarize(total_at_home_pre = sum(total_at_home_bucket), total_devices = sum(total_devices)) %>%
  mutate(percent_at_home= total_at_home_pre*100/total_devices, percent_leaving_home_pre = (100-percent_at_home))

sf_sd_zip_cases_bucket_movement_post <- sf_sd_zip_cases_bucket_movement %>% filter(date >= shelter_date) %>% 
  left_join(sf_sd_zip_cases_bucket_movement_pre_avg %>% dplyr::select(bucketed_case_level, percent_leaving_home_pre)) %>%
  mutate(dif_percent_leaving = percent_leaving_home - percent_leaving_home_pre)

sf_sd_zip_cases_bucket_movement_post %>% filter(!is.na(bucketed_case_level)) %>% ggplot(aes(x = date)) + 
  geom_line(aes(y = dif_percent_leaving, color=bucketed_case_level)) + ggtitle('% leaving home over time (relative to pre SIP) grouped by cases per capita') + labs(y = "difference in % leaving home", x = "Date", color = "cases per capita by May 18th")

fig_sf_movt_case_bucket_post <- sf_sd_zip_cases_bucket_movement_post %>% filter(!is.na(bucketed_case_level)) %>% 
  plot_ly() %>%
  add_lines(x = ~date, y = ~dif_percent_leaving, color = ~bucketed_case_level) %>%
  layout(xaxis = list(title = 'Date'), yaxis = list(title = 'difference in % leaving home'), legend = list(title = list(text = "Cases per capita by May 18th")), title = "% leaving home over time (relative to pre SIP) grouped by cases per capita")

fig_sf_movt_case_bucket_post

That does look more like what we would expect.

Santa Clara County case data

First just visualize what the case data looks like.

scc_cases_zip <- scc_map_cases %>% 
  dplyr::select(Zipcode, Cases, Population) %>% 
  st_drop_geometry() %>%
  rename(zipcode = Zipcode, cases = Cases, pop = Population) %>%
  mutate(cases = replace_na(cases, 0)) %>%
  mutate(cases_by_pop = cases/pop) %>%
  filter(is.numeric(cases_by_pop))

scc_cases_zip %>% ggplot(aes(x = zipcode, y = cases_by_pop)) + geom_point()

Visualize zip code level leaving home data for SCC.

# get leaving home data on zip code level
scc_sd_zip_by_date <- bay_sd %>%
  filter(origin_census_block_group %in% scc_blockgroups$GEOID) %>%
  left_join(scc_bg_zctas %>% dplyr::select(blockgroup, zipcode), by = c("origin_census_block_group" = "blockgroup")) %>% 
  group_by(zipcode, date) %>%
  summarize(total_at_home_zip = sum(completely_home_device_count), total_devices = sum(device_count)) %>% 
  mutate(
    percent_at_home = total_at_home_zip*100/total_devices, 
    percent_leaving_home = (100 - percent_at_home)
  )

scc_sd_zip_by_date %>% ggplot(aes(x = date)) + geom_line(aes(y = percent_leaving_home, color = zipcode))

Try first looking at the overall % leaving home since SIP and cases as of pulled on May 20th.

# get zip code averages leaving home before SIP
scc_sd_zip_pre_avgs <- scc_sd_zip_by_date %>%
  filter(date < pre_sd_date_cutoff) %>%
  group_by(zipcode) %>%
  summarize(total_at_home_pre = sum(total_at_home_zip), total_devices = sum(total_devices)) %>%
  mutate(percent_at_home_pre = total_at_home_pre*100/total_devices, percent_leaving_home_pre = (100 - percent_at_home_pre))

scc_sd_zip_post <- scc_sd_zip_by_date %>%
  filter(date >= shelter_date) %>%
  group_by(zipcode) %>%
  summarize(total_at_home = sum(total_at_home_zip), total_devices = sum(total_devices)) %>%
  mutate(percent_at_home = total_at_home*100/total_devices, percent_leaving_home = (100 - percent_at_home)) %>%
  left_join(scc_sd_zip_pre_avgs %>% dplyr::select(zipcode, percent_leaving_home_pre)) %>%
  mutate(dif_percent_leaving = percent_leaving_home - percent_leaving_home_pre) %>%
  left_join(scc_cases_zip)

# absolute percent leaving
scc_sd_zip_post %>% ggplot(
  aes(x = percent_leaving_home, y = cases_by_pop)) + geom_point() + ggtitle("Santa Clara County") + geom_smooth(method=lm)

scc_zip_cases_sd_cor_abs <- lm(cases_by_pop ~ percent_leaving_home, scc_sd_zip_post , na.action = na.omit)

summary(scc_zip_cases_sd_cor_abs)
## 
## Call:
## lm(formula = cases_by_pop ~ percent_leaving_home, data = scc_sd_zip_post, 
##     na.action = na.omit)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0012330 -0.0005192 -0.0001132  0.0003361  0.0031872 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)
## (Intercept)          9.447e-04  8.907e-04   1.061    0.294
## percent_leaving_home 4.141e-06  1.767e-05   0.234    0.816
## 
## Residual standard error: 0.0009138 on 53 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.001035,   Adjusted R-squared:  -0.01781 
## F-statistic: 0.05491 on 1 and 53 DF,  p-value: 0.8156
# try with difference in percent leaving
scc_sd_zip_post %>% ggplot(
  aes(x = dif_percent_leaving, y = cases_by_pop)) + geom_point() + ggtitle("Santa Clara County") + geom_smooth(method=lm)

scc_zip_cases_sd_cor <- lm(cases_by_pop ~ dif_percent_leaving, scc_sd_zip_post , na.action = na.omit)

summary(scc_zip_cases_sd_cor)
## 
## Call:
## lm(formula = cases_by_pop ~ dif_percent_leaving, data = scc_sd_zip_post, 
##     na.action = na.omit)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0013254 -0.0004842 -0.0001094  0.0003618  0.0032001 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)   
## (Intercept)         1.359e-03  4.463e-04   3.046  0.00361 **
## dif_percent_leaving 7.827e-06  1.614e-05   0.485  0.62976   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0009123 on 53 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.004417,   Adjusted R-squared:  -0.01437 
## F-statistic: 0.2351 on 1 and 53 DF,  p-value: 0.6298

Bucketing cases

Try bucketing by cases per capita.

# bucket case counts
scc_cases_bucketed <- scc_cases_zip %>%
  mutate(bucketed_cases_by_pop = ifelse(cases_by_pop > 0.002, "0.002 or more", ifelse(cases_by_pop > 0.0015, "0.0015 to 0.002", ifelse(cases_by_pop > 0.001, "0.001 to 0.0015", ifelse(cases_by_pop > 0, "0.00001 to 0.001", "0")))))

scc_sd_zip_by_date_case_bucket <- left_join(scc_cases_bucketed, scc_sd_zip_by_date) %>%
  filter(!is.na(bucketed_cases_by_pop) & !is.na(date)) %>%
  group_by(date, bucketed_cases_by_pop) %>%
  summarize(total_at_home_bucket = sum(total_at_home_zip), total_devices = sum(total_devices)) %>% 
  mutate(percent_at_home = total_at_home_bucket*100/total_devices, percent_leaving_home = (100 - percent_at_home)) %>% ungroup()

fig_scc_movt_case_bucket <- plot_ly(scc_sd_zip_by_date_case_bucket) %>%
  add_trace(x = ~date, y = ~percent_leaving_home, color = ~bucketed_cases_by_pop, type = 'scatter', mode = 'lines') %>%
  layout(xaxis = list(title = 'Date'), yaxis = list(title = '% leaving home'), legend = list(title = list(text = "Cases per capita")), title = "% leaving home over time grouped by cases per capita, Santa Clara County")

fig_scc_movt_case_bucket
# try looking at this in scatter plot form
scc_sd_zip_by_date_case_bucket_pre <- scc_sd_zip_by_date_case_bucket %>%
  filter(date < pre_sd_date_cutoff) %>%
  group_by(bucketed_cases_by_pop) %>%
  summarize(total_at_home_bucket = sum(total_at_home_bucket), total_devices = sum(total_devices)) %>% 
  mutate(percent_at_home_pre = total_at_home_bucket*100/total_devices, percent_leaving_home_pre = (100 - percent_at_home_pre)) %>% ungroup()

# join with pre data
scc_sd_zip_by_date_case_bucket_post <- scc_sd_zip_by_date_case_bucket %>%
  filter(date >= shelter_date) %>%
  group_by(bucketed_cases_by_pop) %>%
  summarize(total_at_home_bucket = sum(total_at_home_bucket), total_devices = sum(total_devices)) %>% 
  mutate(percent_at_home = total_at_home_bucket*100/total_devices, percent_leaving_home = (100 - percent_at_home)) %>% ungroup() %>%
  left_join(scc_sd_zip_by_date_case_bucket_pre %>% dplyr::select(bucketed_cases_by_pop, percent_leaving_home_pre)) %>%
  mutate(dif_percent_leaving = percent_leaving_home - percent_leaving_home_pre)

# plot - absolute metric
scc_sd_zip_by_date_case_bucket_post %>% ggplot(aes(x = percent_leaving_home, y = bucketed_cases_by_pop)) + geom_point() + ggtitle('Percent leaving home and cases per capita for buckets of cases per capita, Santa Clara')

# plot - relative metric
scc_sd_zip_by_date_case_bucket_post %>% ggplot(aes(x = dif_percent_leaving, y = bucketed_cases_by_pop)) + geom_point() + ggtitle('Percent leaving home and cases per capita for buckets of cases per capita, Santa Clara')

That generally looks as we’d expect it to–except for the zip codes with zero cases, but it seems like a fair hypothesis that something different might be going on in those counties anyway.

Bucketing movement

Now let’s look at bucketing movement into categories, like we did for SF, but here we can only look at the outcome of cases at a specific date not over time.

To start, just looking at bucketing overall movement since SIP.

ggplot(scc_sd_zip_post, aes(x = zipcode, y = percent_leaving_home)) + geom_point()

ggplot(scc_sd_zip_post, aes(x = zipcode, y = dif_percent_leaving)) + geom_point()

scc_sd_zip_post_bucketed <- scc_sd_zip_post %>%
  mutate(bucketed_percent_leaving = ifelse(percent_leaving_home > 60, "60% or more", ifelse(percent_leaving_home > 55, "55% to 60%", ifelse(percent_leaving_home > 50, "50% to 55%", ifelse(percent_leaving_home > 45, "45% to 50%", ifelse(percent_leaving_home > 40, "40% to 45%", "40% or less"))))), bucketed_dif_leaving = ifelse(dif_percent_leaving > 0, "0% or above", ifelse(dif_percent_leaving > -20, "-0% to -20%", ifelse(dif_percent_leaving > -25, "-20% to -25%", ifelse(dif_percent_leaving > -30, "-25% to -30%", ifelse(dif_percent_leaving > -35, "-30% to -35%", ifelse(dif_percent_leaving > -40, "-35% to -40%", "-40% or lower"))))))) %>%
  left_join(scc_cases_zip) %>%
  filter(!is.na(cases))

scc_sd_cases_post_bucketed_abs <- scc_sd_zip_post_bucketed %>% 
  group_by(bucketed_percent_leaving) %>%
  summarize(total_cases = sum(cases), total_pop = sum(pop)) %>%
  mutate(cases_by_pop_grouped = total_cases/total_pop)

scc_sd_cases_post_bucketed_abs %>% ggplot(aes(x = bucketed_percent_leaving, y = cases_by_pop_grouped)) + geom_point() + ggtitle('Santa Clara')

Wow! A trend that (generally) makes sense!

Look at the difference in percent leaving.

scc_sd_cases_post_bucketed_rel <- scc_sd_zip_post_bucketed %>% 
  group_by(bucketed_dif_leaving) %>%
  summarize(total_cases = sum(cases), total_pop = sum(pop)) %>%
  mutate(cases_by_pop_grouped = total_cases/total_pop)

scc_sd_cases_post_bucketed_rel %>% ggplot(aes(x = bucketed_dif_leaving, y = cases_by_pop_grouped)) + geom_point() + ggtitle('Santa Clara')

Not quite as much as of the expected trend, but appears for -25 to -40 or below buckets.

Updates 5/25/20

Total cases and movement

Looking at the number of cases and movement through 1 week before the most recent date on the case data.

San Francisco

sf_case_date_max <- max(sf_cases_zip$date)

sf_sd_cases_curr <- sf_sd_zip_by_date %>%
  filter(date >= shelter_date & date <= sf_case_date_max - 7) %>%
  group_by(zipcode) %>%
  summarize(total_at_home = sum(total_at_home_zip), total_devices = sum(total_devices)) %>%
  mutate(percent_at_home = total_at_home*100/total_devices, percent_leaving_home = (100 - percent_at_home)) %>%
  left_join(sf_pre_sd_leaving_avg %>% dplyr::select(zipcode, percent_leaving_home_pre)) %>%
  mutate(dif_percent_leaving = percent_leaving_home - percent_leaving_home_pre) %>%
  left_join(sf_cases_zip %>% filter(date == sf_case_date_max)) %>%
  filter(!is.na(confirmed_cases))

fig_sf_sd_cases_curr <- plot_ly(sf_sd_cases_curr) %>%
  add_trace(x = ~percent_leaving_home, y = ~cases_by_pop, type = 'scatter', mode = 'markers') %>%
  add_trace(x = ~percent_leaving_home, y = fitted(lm(sf_sd_cases_curr$cases_by_pop~ sf_sd_cases_curr$percent_leaving_home)), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = 'Percent of devices leaving home'), yaxis = list(title = 'Cases per capita'), title = "SFC, cases per capita and % leaving home through 1 week before case data")

model_sf_curr <- lm(cases_by_pop ~ percent_leaving_home, sf_sd_cases_curr)

summary(model_sf_curr)
## 
## Call:
## lm(formula = cases_by_pop ~ percent_leaving_home, data = sf_sd_cases_curr)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0020057 -0.0012342 -0.0007019  0.0007624  0.0031730 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)  
## (Intercept)           7.749e-03  4.269e-03   1.815   0.0825 .
## percent_leaving_home -1.067e-04  8.227e-05  -1.297   0.2075  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.001687 on 23 degrees of freedom
## Multiple R-squared:  0.06816,    Adjusted R-squared:  0.02764 
## F-statistic: 1.682 on 1 and 23 DF,  p-value: 0.2075
fig_sf_sd_cases_curr
fig_sf_sd_cases_curr_dif <- plot_ly(sf_sd_cases_curr) %>%
  add_trace(x = ~dif_percent_leaving, y = ~cases_by_pop, type = 'scatter', mode = 'markers') %>% 
  add_trace(x = ~dif_percent_leaving, y = fitted(lm(sf_sd_cases_curr$cases_by_pop~ sf_sd_cases_curr$dif_percent_leaving )), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = 'Difference in % of devices leaving home'), yaxis = list(title = 'Cases per capita'), title = "SFC, cases per capita and difference in % leaving home through 1 week before case data")

fig_sf_sd_cases_curr_dif
model_sf_curr_dif <- lm(cases_by_pop ~ dif_percent_leaving, sf_sd_cases_curr)

summary(model_sf_curr_dif)
## 
## Call:
## lm(formula = cases_by_pop ~ dif_percent_leaving, data = sf_sd_cases_curr)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0021224 -0.0011940 -0.0008579  0.0013088  0.0032503 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)
## (Intercept)         2.731e-03  1.960e-03   1.394    0.177
## dif_percent_leaving 2.177e-05  8.375e-05   0.260    0.797
## 
## Residual standard error: 0.001745 on 23 degrees of freedom
## Multiple R-squared:  0.002927,   Adjusted R-squared:  -0.04042 
## F-statistic: 0.06753 on 1 and 23 DF,  p-value: 0.7973

Santa Clara

Note the need to manually include and update the date here.

scc_case_date_max <- as.Date("2020-05-24") # need to manually include and update this

scc_sd_cases_curr <- scc_sd_zip_by_date %>%
  filter(date >= shelter_date & date <= scc_case_date_max - 7) %>%
  group_by(zipcode) %>%
  summarize(total_at_home = sum(total_at_home_zip), total_devices = sum(total_devices)) %>%
  mutate(percent_at_home = total_at_home*100/total_devices, percent_leaving_home = (100 - percent_at_home)) %>%
  left_join(scc_sd_zip_pre_avgs %>% dplyr::select(zipcode, percent_leaving_home_pre)) %>%
  mutate(dif_percent_leaving = percent_leaving_home - percent_leaving_home_pre) %>%
  left_join(scc_cases_zip) %>%
  filter(!is.na(cases))

fig_scc_sd_cases_curr <- plot_ly(scc_sd_cases_curr) %>%
  add_trace(x = ~percent_leaving_home, y = ~cases_by_pop, type = 'scatter', mode = 'markers') %>%
  add_trace(x = ~percent_leaving_home, y = fitted(lm(scc_sd_cases_curr$cases_by_pop~ scc_sd_cases_curr$percent_leaving_home)), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = 'Percent of devices leaving home'), yaxis = list(title = 'Cases per capita'), title = "SCC, cases per capita and % leaving home through 1 week before case data")

fig_scc_sd_cases_curr
model_scc_curr <- lm(cases_by_pop ~ percent_leaving_home, scc_sd_cases_curr)

summary(model_scc_curr)
## 
## Call:
## lm(formula = cases_by_pop ~ percent_leaving_home, data = scc_sd_cases_curr)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0012187 -0.0005234 -0.0001162  0.0003357  0.0031891 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)
## (Intercept)          9.901e-04  8.679e-04   1.141    0.259
## percent_leaving_home 3.239e-06  1.725e-05   0.188    0.852
## 
## Residual standard error: 0.000914 on 53 degrees of freedom
## Multiple R-squared:  0.000665,   Adjusted R-squared:  -0.01819 
## F-statistic: 0.03527 on 1 and 53 DF,  p-value: 0.8518
fig_scc_sd_cases_curr_dif <- plot_ly(scc_sd_cases_curr) %>%
  add_trace(x = ~dif_percent_leaving, y = ~cases_by_pop, type = 'scatter', mode = 'markers') %>%
  add_trace(x = ~dif_percent_leaving, y = fitted(lm(scc_sd_cases_curr$cases_by_pop~ scc_sd_cases_curr$dif_percent_leaving)), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = 'Difference in % of devices leaving home'), yaxis = list(title = 'Cases per capita'), title = "SCC, cases per capita and difference in % leaving home through 1 week before case data")

fig_scc_sd_cases_curr_dif
model_scc_curr_dif <- lm(cases_by_pop ~ dif_percent_leaving, scc_sd_cases_curr)

summary(model_scc_curr_dif)
## 
## Call:
## lm(formula = cases_by_pop ~ dif_percent_leaving, data = scc_sd_cases_curr)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0013084 -0.0004895 -0.0001059  0.0003605  0.0031962 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)   
## (Intercept)         1.336e-03  4.395e-04   3.039  0.00368 **
## dif_percent_leaving 6.911e-06  1.581e-05   0.437  0.66385   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0009126 on 53 degrees of freedom
## Multiple R-squared:  0.003591,   Adjusted R-squared:  -0.01521 
## F-statistic: 0.191 on 1 and 53 DF,  p-value: 0.6639

Look at the outliers in this data. The zip codes with no cases are, with their populations:

# find zip codes with 0 cases
zero_case_scc <- scc_sd_cases_curr %>% filter(cases == 0)

print(zero_case_scc$zipcode)
## [1] "94043" "94305" "95002" "95046" "95053" "95113" "95119"

These zip codes have populations:

print(zero_case_scc$pop)
## [1] 31488 15730  2146  6025  3678  1939 10472

Some of these are very small but some are large. Let’s map these with their populations.

for_mapping_zero_case_scc <- left_join(zctas_94_95, zero_case_scc %>% dplyr::select(zipcode, pop), by = c("ZCTA5CE10" = "zipcode")) %>% filter(!is.na(pop))
mapview(for_mapping_zero_case_scc %>% dplyr::select(pop))

Try excluding these.

scc_sd_cases_curr_filtered <- scc_sd_cases_curr %>% 
  filter(cases != 0) %>%
  mutate(log_cases = log(cases), log_cases_by_pop = log(cases_by_pop))

fig_scc_sd_cases_curr_filt <- plot_ly(scc_sd_cases_curr_filtered) %>%
  add_trace(x = ~percent_leaving_home, y = ~cases_by_pop, type = 'scatter', mode = 'markers') %>%
  add_trace(x = ~percent_leaving_home, y = fitted(lm(scc_sd_cases_curr_filtered$cases_by_pop~ scc_sd_cases_curr_filtered$percent_leaving_home)), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = 'Percent of devices leaving home'), yaxis = list(title = 'Cases per capita'), title = "SCC, cases per capita and % leaving home through 1 week before case data")

fig_scc_sd_cases_curr_filt
model_scc_curr_filt <- lm(cases_by_pop ~ percent_leaving_home, scc_sd_cases_curr_filtered)

summary(model_scc_curr_filt)
## 
## Call:
## lm(formula = cases_by_pop ~ percent_leaving_home, data = scc_sd_cases_curr_filtered)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0009837 -0.0004394 -0.0001515  0.0002312  0.0024178 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -3.210e-03  1.013e-03  -3.168  0.00273 ** 
## percent_leaving_home  9.392e-05  2.090e-05   4.493 4.71e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0007132 on 46 degrees of freedom
## Multiple R-squared:  0.305,  Adjusted R-squared:  0.2899 
## F-statistic: 20.19 on 1 and 46 DF,  p-value: 4.71e-05
fig_scc_sd_cases_curr_dif_filt <- plot_ly(scc_sd_cases_curr_filtered) %>%
  add_trace(x = ~dif_percent_leaving, y = ~cases_by_pop, type = 'scatter', mode = 'markers') %>%
  add_trace(x = ~dif_percent_leaving, y = fitted(lm(scc_sd_cases_curr_filtered$cases_by_pop~ scc_sd_cases_curr_filtered$dif_percent_leaving)), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = 'Difference in % of devices leaving home'), yaxis = list(title = 'Cases per capita'), title = "SCC, cases per capita and difference in % leaving home through 1 week before case data")

fig_scc_sd_cases_curr_dif_filt
model_scc_curr_filt_dif <- lm(cases_by_pop ~ dif_percent_leaving, scc_sd_cases_curr_filtered)

summary(model_scc_curr_filt_dif)
## 
## Call:
## lm(formula = cases_by_pop ~ dif_percent_leaving, data = scc_sd_cases_curr_filtered)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -1.035e-03 -4.735e-04 -9.163e-05  2.592e-04  2.812e-03 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         3.384e-03  4.817e-04   7.025 8.37e-09 ***
## dif_percent_leaving 7.249e-05  1.652e-05   4.389 6.61e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0007183 on 46 degrees of freedom
## Multiple R-squared:  0.2951, Adjusted R-squared:  0.2798 
## F-statistic: 19.26 on 1 and 46 DF,  p-value: 6.608e-05

Adjusting counting of cases by date for SF

Looking at the growth of cases from some baseline level over time. I will try using 0.0028 cases per capita as the baseline rate and seeing how much various zip codes exceed that in the week folloing when they reach that level, as dependent on their movement in the two weeks before the date they reached that level.

# set the cutoff level
cutoff_cases_by_pop <- 0.0028
# how many days after cutoff level to look
later_date_sep = 7
# how many days prior to look at movement averages
prior_movement_time = 14

# filter to just when cases are above the cutoff
sf_cases_cutoff <- sf_cases_zip_edit %>% filter(cases_by_pop >= cutoff_cases_by_pop)

zipcodes_in_cutoff <- unique(sf_cases_cutoff$zipcode)

later_cases_change <- vector(mode = "numeric", length = length(zipcodes_in_cutoff))
leaving_avg <- vector(mode = "numeric", length = length(zipcodes_in_cutoff))

for (i in 1:length(zipcodes_in_cutoff)) {
  curr_zip <- zipcodes_in_cutoff[i]
  curr_vals <- sf_cases_cutoff %>% filter(zipcode == curr_zip)
  later_cases_curr = NA
  leaving_avg_val = NA
  if (curr_vals$date[1] != sf_cases_zip_edit$date[1]) { # need to make sure it didn't start above the cutoff
    later_cases_curr = curr_vals$cases_by_pop[later_date_sep] - curr_vals$cases_by_pop[1] # change in cases an amount of time later
    # get the percent of devices leaving home in an amount of time before the date that cases went above the threshold level
    leaving_computed = sf_sd_zip_by_date %>% 
      filter(zipcode == curr_zip & date < curr_vals$date[1] & (date >= curr_vals$date[1] - prior_movement_time)) %>% 
      summarize(total_at_home = sum(total_at_home_zip), total_devices = sum(total_devices)) %>%
  mutate(percent_at_home = total_at_home*100/total_devices, percent_leaving_home = (100 - percent_at_home)) 
    leaving_avg_val = leaving_computed$percent_leaving_home
  }  
  later_cases_change[i] = later_cases_curr
  leaving_avg[i] = leaving_avg_val
}

# combine
collected_cases_growth_baseline <- data.frame(zipcodes_in_cutoff, later_cases_change, leaving_avg) %>% filter(!is.na(later_cases_change))

# plot
fig_collected_cases_growth_baseline <- plot_ly(collected_cases_growth_baseline) %>%
  add_trace(x = ~leaving_avg, y = ~later_cases_change, type = 'scatter', mode = 'markers') %>%
  add_trace(x = ~leaving_avg, y = fitted(lm(collected_cases_growth_baseline$later_cases_change~ collected_cases_growth_baseline$leaving_avg)), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = '% of devices leaving home in 2 weeks before'), yaxis = list(title = 'Change in cases per capita 1 week later'), title = "SF, change in cases/person in week after >0.0028 cases/person vs % leaving in 2 weeks before")

fig_collected_cases_growth_baseline
model_collected <- lm(collected_cases_growth_baseline$later_cases_change~ collected_cases_growth_baseline$leaving_avg)

summary(model_collected)
## 
## Call:
## lm(formula = collected_cases_growth_baseline$later_cases_change ~ 
##     collected_cases_growth_baseline$leaving_avg)
## 
## Residuals:
##          1          2          3          4          5          6          7 
## -1.075e-04  8.919e-05 -3.617e-04  5.133e-04  5.867e-05 -1.235e-04 -6.844e-05 
## 
## Coefficients:
##                                               Estimate Std. Error t value
## (Intercept)                                 -2.396e-03  1.994e-03  -1.202
## collected_cases_growth_baseline$leaving_avg  5.882e-05  3.977e-05   1.479
##                                             Pr(>|t|)
## (Intercept)                                    0.283
## collected_cases_growth_baseline$leaving_avg    0.199
## 
## Residual standard error: 0.0002957 on 5 degrees of freedom
## Multiple R-squared:  0.3043, Adjusted R-squared:  0.1652 
## F-statistic: 2.187 on 1 and 5 DF,  p-value: 0.1992

Compare this to just the metric we were looking at before (number of cases and movement through 1 week before the most recent date on the case data) for just these zip codes.

# plot just these zip codes from the figure before
fig_sf_sd_cases_curr_limit <- plot_ly(sf_sd_cases_curr %>% filter(zipcode %in% zipcodes_in_cutoff)) %>%
  add_trace(x = ~percent_leaving_home, y = ~cases_by_pop, type = 'scatter', mode = 'markers') %>%
  add_trace(x = ~percent_leaving_home, y = fitted(lm(cases_by_pop~ percent_leaving_home, sf_sd_cases_curr %>% filter(zipcode %in% zipcodes_in_cutoff))), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = 'Percent of devices leaving home'), yaxis = list(title = 'Cases per capita'), title = "SFC, cases/person and % leaving through 1 week before case data, just zip codes in previous analysis")

fig_sf_sd_cases_curr_limit
model_cases_limit <- lm(cases_by_pop~ percent_leaving_home, sf_sd_cases_curr %>% filter(zipcode %in% zipcodes_in_cutoff))

summary(model_cases_limit)
## 
## Call:
## lm(formula = cases_by_pop ~ percent_leaving_home, data = sf_sd_cases_curr %>% 
##     filter(zipcode %in% zipcodes_in_cutoff))
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0009563 -0.0006691 -0.0001425  0.0006696  0.0012885 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)
## (Intercept)          -0.0036064  0.0079312  -0.455    0.665
## percent_leaving_home  0.0001617  0.0001577   1.025    0.345
## 
## Residual standard error: 0.000907 on 6 degrees of freedom
## Multiple R-squared:  0.1491, Adjusted R-squared:  0.007274 
## F-statistic: 1.051 on 1 and 6 DF,  p-value: 0.3448

These two plots are not that similar, which is interesting but makes sense–more evidence that it’s likely important to take into account the original number of cases in a zip code.

Change in cases and movement

For San Francisco, since that’s the data on cases over time that we have. Below I look at the change in cases from April 21st through present. Movement is averaged from 2 weeks before April 21st through 1 week before present.

# get original levels of cases
original_cases <- sf_cases_zip %>% filter(date == (min(date)+1)) %>%
  mutate(log_orig_cases = log(confirmed_cases), log_orig_cases_by_pop = log(cases_by_pop)) %>% 
  dplyr::select(date, zipcode, confirmed_cases, cases_by_pop, log_orig_cases, log_orig_cases_by_pop) %>%
  rename(orig_date = date, orig_cases = confirmed_cases, orig_cases_by_pop = cases_by_pop)

# join with current levels of cases and movement averaged from 2 weeks before start of case data through 1 week before most recent case data
sf_sd_cases_curr_orig <- left_join(original_cases, sf_sd_cases_curr %>% dplyr::select(date, confirmed_cases, cases_by_pop, zipcode)) %>%
  mutate(log_cases = log(confirmed_cases), 
         log_cases_by_pop = log(cases_by_pop), 
         change_cases = confirmed_cases - orig_cases, 
         change_cases_by_pop = cases_by_pop - orig_cases_by_pop, 
         change_log_cases = log_cases - log_orig_cases, 
         change_log_cases_by_pop = log_cases_by_pop - log_orig_cases_by_pop, 
         perc_change_cases = change_cases*100 / orig_cases,
         perc_change_cases_by_pop = change_cases_by_pop*100/orig_cases_by_pop,
         perc_change_log_cases = change_log_cases*100/log_orig_cases,
         perc_change_log_cases_by_pop = change_log_cases_by_pop*100/log_orig_cases_by_pop) %>%
  left_join(sf_sd_zip_by_date %>%
  filter((date >= original_cases$orig_date[1] - 14) & (date <= sf_case_date_max - 7)) %>%
  group_by(zipcode) %>%
  summarize(total_at_home = sum(total_at_home_zip), total_devices = sum(total_devices)) %>%
  mutate(percent_at_home = total_at_home*100/total_devices, percent_leaving_home = (100 - percent_at_home))) %>%
  filter(!is.na(confirmed_cases))


# plot
fig_sf_sd_cases_change <- plot_ly(sf_sd_cases_curr_orig) %>%
  add_trace(x = ~percent_leaving_home, y = ~change_cases_by_pop, type = 'scatter', mode = 'markers') %>%
  add_trace(x = ~percent_leaving_home, y = fitted(lm(change_cases_by_pop~ percent_leaving_home, sf_sd_cases_curr_orig)), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = 'Percent of devices leaving home'), yaxis = list(title = 'Change in cases per capita from April 21st to present'), title = "SFC, change in cases/person and % leaving home April 7th through 1 week before case data")

fig_sf_sd_cases_change
model_cases_change <- lm(change_cases_by_pop ~ percent_leaving_home, sf_sd_cases_curr_orig)

summary(model_cases_change)
## 
## Call:
## lm(formula = change_cases_by_pop ~ percent_leaving_home, data = sf_sd_cases_curr_orig)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0010358 -0.0006275 -0.0004233  0.0006510  0.0020557 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)
## (Intercept)           4.435e-03  2.623e-03   1.691    0.104
## percent_leaving_home -6.919e-05  5.183e-05  -1.335    0.195
## 
## Residual standard error: 0.000924 on 23 degrees of freedom
## Multiple R-squared:  0.07191,    Adjusted R-squared:  0.03155 
## F-statistic: 1.782 on 1 and 23 DF,  p-value: 0.195
# plot change in log of cases
fig_sf_sd_cases_change_log <- plot_ly(sf_sd_cases_curr_orig) %>%
  add_trace(x = ~percent_leaving_home, y = ~change_log_cases_by_pop, type = 'scatter', mode = 'markers') %>%
  add_trace(x = ~percent_leaving_home, y = fitted(lm(change_log_cases_by_pop~ percent_leaving_home, sf_sd_cases_curr_orig)), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = 'Percent of devices leaving home'), yaxis = list(title = 'Change in log of cases per capita from April 21st to present'), title = "SFC, change in cases/person and % leaving home April 7th through 1 week before case data")

fig_sf_sd_cases_change_log
model_cases_change_log <- lm(change_log_cases_by_pop ~ percent_leaving_home, sf_sd_cases_curr_orig)

summary(model_cases_change_log)
## 
## Call:
## lm(formula = change_log_cases_by_pop ~ percent_leaving_home, 
##     data = sf_sd_cases_curr_orig)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.57606 -0.22179 -0.05493  0.06014  1.91556 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)  
## (Intercept)           2.67007    1.30591   2.045   0.0525 .
## percent_leaving_home -0.04239    0.02581  -1.642   0.1141  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4601 on 23 degrees of freedom
## Multiple R-squared:  0.105,  Adjusted R-squared:  0.06605 
## F-statistic: 2.697 on 1 and 23 DF,  p-value: 0.1141
# plot percent change in cases, filtering outliers of very very high percent changes
fig_sf_sd_cases_change_perc <- plot_ly(sf_sd_cases_curr_orig %>% filter(perc_change_cases_by_pop < 1000)) %>%
  add_trace(x = ~percent_leaving_home, y = ~perc_change_cases_by_pop, type = 'scatter', mode = 'markers') %>%
  add_trace(x = ~percent_leaving_home, y = fitted(lm(perc_change_cases_by_pop~ percent_leaving_home, sf_sd_cases_curr_orig %>% filter(perc_change_cases_by_pop < 1000))), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = 'Percent of devices leaving home'), yaxis = list(title = 'Percent change in cases per capita from April 21st to present'), title = "SFC, % change in cases/person and % leaving home April 7th through 1 week before most case data")

fig_sf_sd_cases_change_perc
model_cases_change_perc <- lm(perc_change_cases_by_pop ~ percent_leaving_home, sf_sd_cases_curr_orig %>% filter(perc_change_cases_by_pop < 1000))

summary(model_cases_change_perc)
## 
## Call:
## lm(formula = perc_change_cases_by_pop ~ percent_leaving_home, 
##     data = sf_sd_cases_curr_orig %>% filter(perc_change_cases_by_pop < 
##         1000))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -68.649 -22.352  -2.686  12.393  76.408 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)   
## (Intercept)           326.350    100.109   3.260  0.00359 **
## percent_leaving_home   -5.245      1.977  -2.653  0.01452 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 35.19 on 22 degrees of freedom
## Multiple R-squared:  0.2424, Adjusted R-squared:  0.208 
## F-statistic: 7.039 on 1 and 22 DF,  p-value: 0.01452

This still is weird.

Correlations

Try just looking at a few correlations between demographic variables, movement, and change in cases per capita since April 21st. The movement and cases metrics are the same as in the previous section. Demographic information from ACS census data.

First occupants per room.

sf_occupants_zip <- pullCensus("group(B25014)", sf_fips) %>% 
  dplyr::select(-c(contains("EA"),contains("MA"),contains("M"))) %>% 
  gather(key = "variable", value = "estimate", -blockgroup) %>% 
  mutate(label = acs_vars$label[match(variable,acs_vars$name)]) %>% 
  dplyr::select(-variable) %>% 
  separate(label, into = c(NA, NA, NA,"occupants per room"), sep = "!!") %>% 
  filter(!is.na(`occupants per room`)) %>%
  group_by(blockgroup, `occupants per room`) %>%
  summarize(estimate_tot = sum(estimate)) %>% 
  spread(key = `occupants per room`, value = estimate_tot) %>%
  mutate(total_nums = `0.50 or less occupants per room` + `0.51 to 1.00 occupants per room` + `1.01 to 1.50 occupants per room` + `1.51 to 2.00 occupants per room` + `2.01 or more occupants per room`) %>%
  left_join(sf_bg_zctas %>% dplyr::select(blockgroup, zipcode)) %>% 
  group_by(zipcode) %>%
  summarize(total_nums = sum(total_nums), total_0.5less = sum(`0.50 or less occupants per room`), total_0.5to1 = sum(`0.51 to 1.00 occupants per room`), total_1to1.5 = sum(`1.01 to 1.50 occupants per room`), total_1.5to2 = sum(`1.51 to 2.00 occupants per room`), total_2more = sum(`2.01 or more occupants per room`)) %>%
  mutate(`percent more than 1` = (total_1to1.5 + total_1.5to2 + total_2more) * 100/ total_nums, `percent less than 1` = (100-`percent more than 1`), `percent 0.5 or less` = total_0.5less*100/total_nums, `percent more than 0.5` = (100-`percent 0.5 or less`), `percent more than 2` = total_2more*100/total_nums)

sf_occupants_sd_cases_change <- left_join(sf_occupants_zip, sf_sd_cases_curr_orig) %>% filter(!is.na(confirmed_cases))

# plot
fig_sf_cases_change_occupants <- plot_ly(sf_occupants_sd_cases_change) %>%
  add_trace(x = ~`percent more than 1`, y = ~change_cases_by_pop, type = 'scatter', mode = 'markers') %>%
  add_trace(x = ~`percent more than 1`, y = fitted(lm(change_cases_by_pop~ `percent more than 1`, sf_occupants_sd_cases_change)), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = 'Percent of households with more than 1 occupants per room'), yaxis = list(title = 'Change in cases per capita from April 21st to present'), title = "SFC, change in cases per capita and occupants per room")

fig_sf_cases_change_occupants
model_cases_change_occupants <- lm(change_cases_by_pop ~ `percent more than 1`, sf_occupants_sd_cases_change)

summary(model_cases_change_occupants)
## 
## Call:
## lm(formula = change_cases_by_pop ~ `percent more than 1`, data = sf_occupants_sd_cases_change)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0011711 -0.0006375 -0.0004407  0.0007771  0.0018708 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)   
## (Intercept)           8.841e-04  2.796e-04   3.162  0.00435 **
## `percent more than 1` 7.658e-06  2.647e-05   0.289  0.77493   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0009574 on 23 degrees of freedom
## Multiple R-squared:  0.003626,   Adjusted R-squared:  -0.03969 
## F-statistic: 0.0837 on 1 and 23 DF,  p-value: 0.7749
# try with more than 2 people
fig_sf_cases_change_occupants_2 <- plot_ly(sf_occupants_sd_cases_change) %>%
  add_trace(x = ~`percent more than 2`, y = ~change_cases_by_pop, type = 'scatter', mode = 'markers') %>%
  add_trace(x = ~`percent more than 2`, y = fitted(lm(change_cases_by_pop~ `percent more than 2`, sf_occupants_sd_cases_change)), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = 'Percent of households with more than 2 occupants per room'), yaxis = list(title = 'Change in cases per capita from April 21st to present'), title = "SFC, change in cases per capita and occupants per room")

fig_sf_cases_change_occupants_2
model_cases_change_occupants_2 <- lm(change_cases_by_pop ~ `percent more than 2`, sf_occupants_sd_cases_change)

summary(model_cases_change_occupants)
## 
## Call:
## lm(formula = change_cases_by_pop ~ `percent more than 1`, data = sf_occupants_sd_cases_change)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0011711 -0.0006375 -0.0004407  0.0007771  0.0018708 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)   
## (Intercept)           8.841e-04  2.796e-04   3.162  0.00435 **
## `percent more than 1` 7.658e-06  2.647e-05   0.289  0.77493   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0009574 on 23 degrees of freedom
## Multiple R-squared:  0.003626,   Adjusted R-squared:  -0.03969 
## F-statistic: 0.0837 on 1 and 23 DF,  p-value: 0.7749
# combine with social distancing
model_cases_change_occupants_sd <- lm(change_cases_by_pop ~ `percent more than 1` + percent_leaving_home, sf_occupants_sd_cases_change)

summary(model_cases_change_occupants_sd)
## 
## Call:
## lm(formula = change_cases_by_pop ~ `percent more than 1` + percent_leaving_home, 
##     data = sf_occupants_sd_cases_change)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0009834 -0.0006480 -0.0003397  0.0005797  0.0020671 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)  
## (Intercept)            4.731e-03  2.697e-03   1.754   0.0933 .
## `percent more than 1`  1.703e-05  2.669e-05   0.638   0.5301  
## percent_leaving_home  -7.765e-05  5.416e-05  -1.434   0.1657  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0009362 on 22 degrees of freedom
## Multiple R-squared:  0.08876,    Adjusted R-squared:  0.005919 
## F-statistic: 1.071 on 2 and 22 DF,  p-value: 0.3597
# try with absolute numbers of cases
model_cases_change_occupants_abs <- lm(change_cases ~ `percent more than 2`, sf_occupants_sd_cases_change)

summary(model_cases_change_occupants_abs)
## 
## Call:
## lm(formula = change_cases ~ `percent more than 2`, data = sf_occupants_sd_cases_change)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -41.15 -30.05 -23.15  10.85 169.11 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)   
## (Intercept)             41.152     11.561   3.559  0.00167 **
## `percent more than 2`   -1.375      3.603  -0.382  0.70622   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 52.59 on 23 degrees of freedom
## Multiple R-squared:  0.006293,   Adjusted R-squared:  -0.03691 
## F-statistic: 0.1457 on 1 and 23 DF,  p-value: 0.7062

That doesn’t seem significant.

Next try population density, using the area of the zip code regions

sf_pop_density_zip <- sf_cases_zip %>% filter(date == max(date)) %>%
  dplyr::select(zipcode, total_pop_zip) %>%
  left_join(zctas_94, by = c("zipcode" = "ZCTA5CE10")) %>%
  st_as_sf() %>%
  mutate(zip_area = st_area(.), pop_density = total_pop_zip / zip_area) %>%
  filter(!is.na(pop_density))

mapview(sf_pop_density_zip %>% dplyr::select(pop_density))
sf_pop_density_zip_sd_cases <- left_join(sf_sd_cases_curr_orig, sf_pop_density_zip) 

# plot
fig_sf_cases_change_pop_density <- plot_ly(sf_pop_density_zip_sd_cases) %>%
  add_trace(x = ~pop_density, y = ~change_cases_by_pop, type = 'scatter', mode = 'markers') %>%
  add_trace(x = ~pop_density, y = fitted(lm(change_cases_by_pop~ pop_density, sf_pop_density_zip_sd_cases)), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = 'Population density'), yaxis = list(title = 'Change in cases per capita from April 21st to present'), title = "SFC, change in cases per capita and population density")

fig_sf_cases_change_pop_density
model_cases_change_pop_density <- lm(change_cases_by_pop ~ pop_density, sf_pop_density_zip_sd_cases)

summary(model_cases_change_pop_density)
## 
## Call:
## lm(formula = change_cases_by_pop ~ pop_density, data = sf_pop_density_zip_sd_cases)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0008628 -0.0006550 -0.0005080  0.0007886  0.0019122 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)  
## (Intercept) 0.0006909  0.0003756   1.839   0.0788 .
## pop_density 0.0291396  0.0374954   0.777   0.4450  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0009468 on 23 degrees of freedom
## Multiple R-squared:  0.02559,    Adjusted R-squared:  -0.01678 
## F-statistic: 0.604 on 1 and 23 DF,  p-value: 0.445
model_cases_change_pop_density_sd <- lm(change_cases_by_pop ~ pop_density + percent_leaving_home, sf_pop_density_zip_sd_cases)

summary(model_cases_change_pop_density_sd)
## 
## Call:
## lm(formula = change_cases_by_pop ~ pop_density + percent_leaving_home, 
##     data = sf_pop_density_zip_sd_cases)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0010452 -0.0006375 -0.0003606  0.0006308  0.0019521 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)
## (Intercept)           4.130e-03  2.680e-03   1.541    0.138
## pop_density           2.751e-02  3.698e-02   0.744    0.465
## percent_leaving_home -6.786e-05  5.237e-05  -1.296    0.208
## 
## Residual standard error: 0.0009331 on 22 degrees of freedom
## Multiple R-squared:  0.09468,    Adjusted R-squared:  0.01238 
## F-statistic:  1.15 on 2 and 22 DF,  p-value: 0.3348
# absolute cases
model_cases_change_pop_density_abs <- lm(change_cases ~ pop_density, sf_pop_density_zip_sd_cases)

summary(model_cases_change_pop_density_abs)
## 
## Call:
## lm(formula = change_cases ~ pop_density, data = sf_pop_density_zip_sd_cases)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -58.231 -28.842 -19.763   7.146 162.939 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)    23.30      20.57   1.133    0.269
## pop_density  1851.32    2053.38   0.902    0.377
## 
## Residual standard error: 51.85 on 23 degrees of freedom
## Multiple R-squared:  0.03414,    Adjusted R-squared:  -0.007858 
## F-statistic: 0.8129 on 1 and 23 DF,  p-value: 0.3766

Not significant.

Try number of people per household.

sf_house_size_zip <- pullCensus("group(B11016)", sf_fips) %>% 
  dplyr::select(-c(contains("EA"),contains("MA"),contains("M"))) %>% 
  gather(key = "variable", value = "estimate", -blockgroup) %>% 
  mutate(label = acs_vars$label[match(variable,acs_vars$name)]) %>% 
  dplyr::select(-variable) %>% 
  separate(label, into = c(NA, NA, "type", "size"), sep = "!!") %>% 
  filter(!is.na(type)) %>%
  filter(!is.na(size)) %>% 
  dplyr::select(-type) %>%
  group_by(blockgroup, size) %>%
  summarize(`total of this size` = sum(estimate)) %>% 
  spread(key = size, value = `total of this size`) %>%
  mutate(total_nums = `1-person household` + `2-person household` + `3-person household` + `4-person household` + `5-person household`+ `6-person household` + `7-or-more person household`) %>%
  left_join(sf_bg_zctas %>% dplyr::select(blockgroup, zipcode)) %>% 
  group_by(zipcode) %>%
  summarize(total_nums = sum(total_nums), total_1 = sum(`1-person household`), total_2 = sum(`2-person household`), total_3 = sum(`3-person household`), total_4 = sum(`4-person household`), total_5 = sum(`5-person household`), total_6 = sum(`6-person household`), total_7more = sum(`7-or-more person household`)) %>%
  mutate(`percent 5 or more` = (total_5 + total_6 + total_7more)* 100/ total_nums, `percent 1 or 2 only` = (total_1 + total_2)*100/total_nums)

sf_size_sd_cases_change <- left_join(sf_sd_cases_curr_orig, sf_house_size_zip)

# plot
fig_sf_cases_change_size <- plot_ly(sf_size_sd_cases_change) %>%
  add_trace(x = ~`percent 5 or more`, y = ~change_cases_by_pop, type = 'scatter', mode = 'markers') %>%
  add_trace(x = ~`percent 5 or more`, y = fitted(lm(change_cases_by_pop~ `percent 5 or more`, sf_size_sd_cases_change)), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = 'Percent of households with 5 or more members'), yaxis = list(title = 'Change in cases per capita from April 21st to present'), title = "SFC, change in cases per capita and household size")

fig_sf_cases_change_size
model_cases_change_size <- lm(change_cases_by_pop ~ `percent 5 or more`, sf_size_sd_cases_change)

summary(model_cases_change_size)
## 
## Call:
## lm(formula = change_cases_by_pop ~ `percent 5 or more`, data = sf_size_sd_cases_change)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0009514 -0.0006465 -0.0003560  0.0001575  0.0019053 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)  
## (Intercept)         6.193e-04  2.644e-04   2.343   0.0282 *
## `percent 5 or more` 5.386e-05  3.205e-05   1.680   0.1064  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0009052 on 23 degrees of freedom
## Multiple R-squared:  0.1093, Adjusted R-squared:  0.07061 
## F-statistic: 2.823 on 1 and 23 DF,  p-value: 0.1064
model_cases_change_size_sd <- lm(change_cases_by_pop ~ `percent 5 or more` + percent_leaving_home, sf_size_sd_cases_change)

summary(model_cases_change_size_sd)
## 
## Call:
## lm(formula = change_cases_by_pop ~ `percent 5 or more` + percent_leaving_home, 
##     data = sf_size_sd_cases_change)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0010850 -0.0006796 -0.0002646  0.0001051  0.0018903 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)
## (Intercept)           2.906e-03  2.858e-03   1.017    0.320
## `percent 5 or more`   4.379e-05  3.465e-05   1.264    0.219
## percent_leaving_home -4.412e-05  5.488e-05  -0.804    0.430
## 
## Residual standard error: 0.0009122 on 22 degrees of freedom
## Multiple R-squared:  0.1347, Adjusted R-squared:  0.05608 
## F-statistic: 1.713 on 2 and 22 DF,  p-value: 0.2035
# testing with number of cases
fig_sf_cases_change_size <- plot_ly(sf_size_sd_cases_change) %>%
  add_trace(x = ~`percent 5 or more`, y = ~change_cases, type = 'scatter', mode = 'markers') %>%
  add_trace(x = ~`percent 5 or more`, y = fitted(lm(change_cases~ `percent 5 or more`, sf_size_sd_cases_change)), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = 'Percent of households with 5 or more members'), yaxis = list(title = 'Change in cases per capita from April 21st to present'), title = "SFC, change in cases per capita and household size")

fig_sf_cases_change_size
model_cases_change_size_abs <- lm(change_cases ~ `percent 5 or more`, sf_size_sd_cases_change)

summary(model_cases_change_size_abs)
## 
## Call:
## lm(formula = change_cases ~ `percent 5 or more`, data = sf_size_sd_cases_change)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -51.50 -23.67 -12.88  19.69 157.91 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)  
## (Intercept)           12.135     13.301   0.912   0.3711  
## `percent 5 or more`    4.523      1.613   2.805   0.0101 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 45.54 on 23 degrees of freedom
## Multiple R-squared:  0.2548, Adjusted R-squared:  0.2224 
## F-statistic: 7.866 on 1 and 23 DF,  p-value: 0.01006

Also not very significant.

Try units in structure.

sf_units_zip <- pullCensus("group(B25024)", sf_fips) %>% dplyr::select(-c(contains("EA"),contains("MA"),contains("M"))) %>% 
  gather(key = "variable", value = "estimate", -blockgroup) %>% 
  mutate(label = acs_vars$label[match(variable,acs_vars$name)]) %>% 
  dplyr::select(-variable) %>% 
  separate(label, into = c(NA, NA, "units"), sep = "!!") %>% 
  filter(!is.na(units)) %>%
  spread(key = units, value = estimate) %>%
  mutate(total_nums = `1, attached` + `1, detached` + `10 to 19` + `2` + `20 to 49`+ `3 or 4` + `5 to 9`+ `50 or more`+ `Boat, RV, van, etc.`+ `Mobile home`, total_20more = `20 to 49`+`50 or more`, total_10more = `10 to 19` + `20 to 49`+`50 or more`, total_1 = `1, attached` + `1, detached`) %>%
  left_join(sf_bg_zctas %>% dplyr::select(blockgroup, zipcode)) %>% 
  group_by(zipcode) %>%
  summarize(total_nums = sum(total_nums), total_20more = sum(total_20more), total_10more = sum(total_20more), total_1_only = sum(total_1)) %>%
  mutate(`percent 10 or more` = total_10more*100/total_nums, `percent 20 or more` = total_20more*100/total_nums, `percent 1 only` = total_1_only*100/total_nums, `percent more than 1` = (100 - `percent 1 only`))

sf_units_sd_cases_change <- left_join(sf_sd_cases_curr_orig, sf_units_zip)

# plot with more than 20 first
fig_sf_cases_change_units <- plot_ly(sf_units_sd_cases_change) %>%
  add_trace(x = ~`percent 20 or more`, y = ~change_cases_by_pop, type = 'scatter', mode = 'markers') %>%
  add_trace(x = ~`percent 20 or more`, y = fitted(lm(change_cases_by_pop~ `percent 20 or more`, sf_units_sd_cases_change)), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = 'Percent of structures with 20 or more units'), yaxis = list(title = 'Change in cases per capita from April 21st to present'), title = "SFC, change in cases per capita and units per structure")

fig_sf_cases_change_units
model_cases_change_units <- lm(change_cases_by_pop ~ `percent 20 or more`, sf_units_sd_cases_change)

summary(model_cases_change_units)
## 
## Call:
## lm(formula = change_cases_by_pop ~ `percent 20 or more`, data = sf_units_sd_cases_change)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0009335 -0.0006873 -0.0004764  0.0007709  0.0018706 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)   
## (Intercept)           9.511e-04  2.803e-04   3.393   0.0025 **
## `percent 20 or more` -2.370e-07  6.019e-06  -0.039   0.9689   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0009591 on 23 degrees of freedom
## Multiple R-squared:  6.74e-05,   Adjusted R-squared:  -0.04341 
## F-statistic: 0.00155 on 1 and 23 DF,  p-value: 0.9689
model_cases_change_units_sd <- lm(change_cases_by_pop ~ `percent 20 or more` + percent_leaving_home, sf_units_sd_cases_change)

summary(model_cases_change_size_sd)
## 
## Call:
## lm(formula = change_cases_by_pop ~ `percent 5 or more` + percent_leaving_home, 
##     data = sf_size_sd_cases_change)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0010850 -0.0006796 -0.0002646  0.0001051  0.0018903 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)
## (Intercept)           2.906e-03  2.858e-03   1.017    0.320
## `percent 5 or more`   4.379e-05  3.465e-05   1.264    0.219
## percent_leaving_home -4.412e-05  5.488e-05  -0.804    0.430
## 
## Residual standard error: 0.0009122 on 22 degrees of freedom
## Multiple R-squared:  0.1347, Adjusted R-squared:  0.05608 
## F-statistic: 1.713 on 2 and 22 DF,  p-value: 0.2035
# try with more than 1 unit per structure
fig_sf_cases_change_units_1 <- plot_ly(sf_units_sd_cases_change) %>%
  add_trace(x = ~`percent more than 1`, y = ~change_cases_by_pop, type = 'scatter', mode = 'markers') %>%
  add_trace(x = ~`percent more than 1`, y = fitted(lm(change_cases_by_pop~ `percent more than 1`, sf_units_sd_cases_change)), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = 'Percent of structures with more than 1 unit'), yaxis = list(title = 'Change in cases per capita from April 21st to present'), title = "SFC, change in cases per capita and units per structure")

fig_sf_cases_change_units_1
model_cases_change_units_1 <- lm(change_cases_by_pop ~ `percent more than 1`, sf_units_sd_cases_change)

summary(model_cases_change_units_1)
## 
## Call:
## lm(formula = change_cases_by_pop ~ `percent more than 1`, data = sf_units_sd_cases_change)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0008895 -0.0007033 -0.0004524  0.0007261  0.0018753 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)  
## (Intercept)            1.085e-03  4.916e-04   2.208   0.0375 *
## `percent more than 1` -2.044e-06  6.508e-06  -0.314   0.7563  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0009571 on 23 degrees of freedom
## Multiple R-squared:  0.004272,   Adjusted R-squared:  -0.03902 
## F-statistic: 0.09868 on 1 and 23 DF,  p-value: 0.7563
model_cases_change_units_sd_1 <- lm(change_cases_by_pop ~ `percent more than 1` + percent_leaving_home, sf_units_sd_cases_change)

summary(model_cases_change_units_sd_1)
## 
## Call:
## lm(formula = change_cases_by_pop ~ `percent more than 1` + percent_leaving_home, 
##     data = sf_units_sd_cases_change)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0010619 -0.0006486 -0.0004085  0.0006495  0.0020610 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)
## (Intercept)            4.484e-03  2.717e-03   1.650    0.113
## `percent more than 1`  7.509e-07  6.788e-06   0.111    0.913
## percent_leaving_home  -7.120e-05  5.600e-05  -1.271    0.217
## 
## Residual standard error: 0.0009445 on 22 degrees of freedom
## Multiple R-squared:  0.07242,    Adjusted R-squared:  -0.0119 
## F-statistic: 0.8588 on 2 and 22 DF,  p-value: 0.4374

That was unfortunately not very fruitful so far.

Visits

San Francisco, initial analysis

Starting to look at visits to locations, beginning with SFC.

poi_ca <- readRDS(paste0(sg_path,'ca_poi.rds'))

Start with the week before shelter-in-place started.

# week <- "2020-03-09"
# filter_loc <- "San Francisco"
# filter_loc_abr <- "sfc"
# 
# patterns <- 
#   readRDS(paste0(sg_path,'weekly-patterns/v2/main-file/',week,'-weekly-patterns-ca.rds'))
# 
# hps <- 
#   readRDS(paste0(sg_path,'weekly-patterns/v2/home-summary-file/',week,'-home-panel-summary.rds'))
# 
# sfc_patterns <-
#   patterns %>% 
#   left_join(
#     poi_ca %>%
#       dplyr::select(
#         safegraph_place_id,
#         longitude,
#         latitude,
#         naics_code
#       ), 
#     by = "safegraph_place_id"
#   ) %>% 
#   filter(!is.na(longitude)) %>% 
#   mutate(
#     long = longitude,
#     lat = latitude
#   ) %>% 
#   st_as_sf(coords = c("long","lat")) %>% 
#   st_set_crs(4326) %>% 
#   st_join(
#     counties("CA", cb=F, progress_bar=F) %>% 
#       filter(NAME == filter_loc) %>% 
#       st_transform(4326) %>% 
#       dplyr::select(NAME),
#     left=F
#   )
# 
# sfc_patterns_origins_daily <-
#   process_patterns_origins_daily(sfc_patterns,hps)
# 
# sfc_patterns_origins_daily <-
#   sfc_patterns_origins_daily %>% 
#   left_join(
#     sfc_patterns %>% 
#       as.data.frame() %>% 
#       dplyr::select(
#         safegraph_place_id,
#         location_name,
#         street_address,
#         naics_code
#       )
#   )
# 
# saveRDS(sfc_patterns_origins_daily,paste0(sg_path,"weekly-patterns/v2/",filter_loc_abr,"_patterns_",week,"_origins_daily.rds"))

sfc_patterns_origins_daily <- readRDS(paste0(sg_path,"weekly-patterns/v2/sfc_patterns_2020-03-09_origins_daily.rds"))

Try looking at the absolute number of visits in each zip code in the week before SIP and the absolute number of cases to date in each county in SF.

sf_visits_week_pre <- sfc_patterns_origins_daily %>%
  left_join(sf_bg_zctas, by = c("origin_census_block_group" = "blockgroup")) %>%
  group_by(zipcode) %>%
  summarize(visit_counts_high_zip = sum(visit_counts_high), visit_counts_low_zip = sum(visit_counts_low)) %>%
  filter(!is.na(visit_counts_high_zip))

# combine with case data
sf_visits_week_pre_cases <- left_join(sf_visits_week_pre, sf_cases_zip %>% filter(date == max(date)))
  
# plot 
fig_sf_cases_visits_week_pre <- plot_ly(sf_visits_week_pre_cases) %>%
  add_trace(x = ~visit_counts_high_zip, y = ~confirmed_cases, type = 'scatter', mode = 'markers') %>%
  add_trace(x = ~visit_counts_high_zip, y = fitted(lm(confirmed_cases~ visit_counts_high_zip, sf_visits_week_pre_cases)), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = 'Number of visits in week before SIP (high)'), yaxis = list(title = 'Current cases'), title = "SFC, cases and visits in week before SIP")

fig_sf_cases_visits_week_pre
model_cases_visits_week_pre <- lm(confirmed_cases ~ visit_counts_high_zip, sf_visits_week_pre_cases)

summary(model_cases_visits_week_pre)
## 
## Call:
## lm(formula = confirmed_cases ~ visit_counts_high_zip, data = sf_visits_week_pre_cases)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -113.26  -31.31   -9.67   44.19  172.26 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           -1.120e+01  2.514e+01  -0.445 0.660171    
## visit_counts_high_zip  5.420e-04  1.183e-04   4.581 0.000132 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 68.68 on 23 degrees of freedom
## Multiple R-squared:  0.4771, Adjusted R-squared:  0.4544 
## F-statistic: 20.99 on 1 and 23 DF,  p-value: 0.0001322
# try with low just for comparison
fig_sf_cases_visits_week_pre_low <- plot_ly(sf_visits_week_pre_cases) %>%
  add_trace(x = ~visit_counts_low_zip, y = ~confirmed_cases, type = 'scatter', mode = 'markers') %>%
  add_trace(x = ~visit_counts_low_zip, y = fitted(lm(confirmed_cases~ visit_counts_low_zip, sf_visits_week_pre_cases)), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = 'Number of visits in week before SIP (low)'), yaxis = list(title = 'Current cases'), title = "SFC, cases and visits in week before SIP")

fig_sf_cases_visits_week_pre_low
model_cases_visits_week_pre_low <- lm(confirmed_cases ~ visit_counts_low_zip, sf_visits_week_pre_cases)

summary(model_cases_visits_week_pre_low)
## 
## Call:
## lm(formula = confirmed_cases ~ visit_counts_low_zip, data = sf_visits_week_pre_cases)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -113.858  -31.536   -7.397   46.414  171.954 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -1.305e+01  2.543e+01  -0.513 0.612604    
## visit_counts_low_zip  8.976e-04  1.954e-04   4.593 0.000128 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 68.59 on 23 degrees of freedom
## Multiple R-squared:  0.4784, Adjusted R-squared:  0.4557 
## F-statistic:  21.1 on 1 and 23 DF,  p-value: 0.0001283

That is very different from the results when we looked at percent leaving home! Interesting. Results are similar with high and low, so I’m just going to keep using the high counts for now.

But could this just be a population thing? I.e. higher population correlates with higher cases and more visits?

fig_sf_cases_pop_visits_week_pre <- plot_ly(sf_visits_week_pre_cases) %>%
  add_trace(x = ~visit_counts_high_zip, y = ~cases_by_pop, type = 'scatter', mode = 'markers') %>%
  add_trace(x = ~visit_counts_high_zip, y = fitted(lm(cases_by_pop~ visit_counts_high_zip, sf_visits_week_pre_cases)), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = 'Number of visits in week before SIP (high)'), yaxis = list(title = 'Current cases per capita'), title = "SFC, cases and visits in week before SIP")

fig_sf_cases_pop_visits_week_pre
model_cases_pop_visits_week_pre <- lm(cases_by_pop ~ visit_counts_high_zip, sf_visits_week_pre_cases)

summary(model_cases_pop_visits_week_pre)
## 
## Call:
## lm(formula = cases_by_pop ~ visit_counts_high_zip, data = sf_visits_week_pre_cases)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0018240 -0.0011406 -0.0006252  0.0009908  0.0033779 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)  
## (Intercept)           1.559e-03  6.177e-04   2.524    0.019 *
## visit_counts_high_zip 3.770e-09  2.907e-09   1.297    0.207  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.001687 on 23 degrees of freedom
## Multiple R-squared:  0.06817,    Adjusted R-squared:  0.02765 
## F-statistic: 1.682 on 1 and 23 DF,  p-value: 0.2075

The relationship is definitely not as strong if you look at cases per capita. What if you pair with visits per capita?

sf_visits_week_pre_cases <- sf_visits_week_pre_cases %>%
  mutate(visits_per_capita_high = visit_counts_high_zip / total_pop_zip, visits_per_capita_low = visit_counts_low_zip / total_pop_zip)

# plot
fig_sf_cases_pop_visits_pop_week_pre <- plot_ly(sf_visits_week_pre_cases) %>%
  add_trace(x = ~visits_per_capita_high, y = ~cases_by_pop, type = 'scatter', mode = 'markers') %>%
  add_trace(x = ~visits_per_capita_high, y = fitted(lm(cases_by_pop~ visits_per_capita_high, sf_visits_week_pre_cases)), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = 'Number of visits per capita in week before SIP (high)'), yaxis = list(title = 'Current cases per capita'), title = "SFC, cases and visits in week before SIP")

fig_sf_cases_pop_visits_pop_week_pre
model_cases_pop_visits_pop_week_pre <- lm(cases_by_pop ~ visits_per_capita_high, sf_visits_week_pre_cases)

summary(model_cases_pop_visits_pop_week_pre)
## 
## Call:
## lm(formula = cases_by_pop ~ visits_per_capita_high, data = sf_visits_week_pre_cases)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0018453 -0.0012560 -0.0006777  0.0008816  0.0033754 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)
## (Intercept)            0.0006915  0.0012934   0.535    0.598
## visits_per_capita_high 0.0003223  0.0002615   1.233    0.230
## 
## Residual standard error: 0.001693 on 23 degrees of freedom
## Multiple R-squared:  0.06197,    Adjusted R-squared:  0.02118 
## F-statistic: 1.519 on 1 and 23 DF,  p-value: 0.2302

Again not significant results, but at least the trend is in the direction we would expect.

Also do multiple regression with absolute cases, absolute visits, and population

cases_pop_visits_sf <- lm(confirmed_cases ~ visit_counts_high_zip + total_pop_zip, sf_visits_week_pre_cases)

summary(cases_pop_visits_sf)
## 
## Call:
## lm(formula = confirmed_cases ~ visit_counts_high_zip + total_pop_zip, 
##     data = sf_visits_week_pre_cases)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -109.63  -48.03  -12.41   33.74  165.39 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)
## (Intercept)           -2.493e+01  2.832e+01  -0.880    0.388
## visit_counts_high_zip  6.335e-05  4.728e-04   0.134    0.895
## total_pop_zip          2.885e-03  2.760e-03   1.045    0.307
## 
## Residual standard error: 68.54 on 22 degrees of freedom
## Multiple R-squared:  0.5019, Adjusted R-squared:  0.4566 
## F-statistic: 11.08 on 2 and 22 DF,  p-value: 0.0004687

Hmm…

Let’s do exactly the same thing but with SCC just for comparison.

Santa Clara County, initial analysis

# week <- "2020-03-09"
# filter_loc <- "Santa Clara"
# filter_loc_abr <- "scc"
# 
# patterns <- 
#   readRDS(paste0(sg_path,'weekly-patterns/v2/main-file/',week,'-weekly-patterns-ca.rds'))
# 
# hps <- 
#   readRDS(paste0(sg_path,'weekly-patterns/v2/home-summary-file/',week,'-home-panel-summary.rds'))
# 
# scc_patterns <-
#   patterns %>% 
#   left_join(
#     poi_ca %>%
#       dplyr::select(
#         safegraph_place_id,
#         longitude,
#         latitude,
#         naics_code
#       ), 
#     by = "safegraph_place_id"
#   ) %>% 
#   filter(!is.na(longitude)) %>% 
#   mutate(
#     long = longitude,
#     lat = latitude
#   ) %>% 
#   st_as_sf(coords = c("long","lat")) %>% 
#   st_set_crs(4326) %>% 
#   st_join(
#     counties("CA", cb=F, progress_bar=F) %>% 
#       filter(NAME == filter_loc) %>% 
#       st_transform(4326) %>% 
#       dplyr::select(NAME),
#     left=F
#   )
# 
# scc_patterns_origins_daily <-
#   process_patterns_origins_daily(scc_patterns,hps)
# 
# scc_patterns_origins_daily <-
#   scc_patterns_origins_daily %>% 
#   left_join(
#     scc_patterns %>% 
#       as.data.frame() %>% 
#       dplyr::select(
#         safegraph_place_id,
#         location_name,
#         street_address,
#         naics_code
#       )
#   )
# 
# saveRDS(scc_patterns_origins_daily,paste0(sg_path,"weekly-patterns/v2/",filter_loc_abr,"_patterns_",week,"_origins_daily.rds"))

scc_patterns_origins_daily <- readRDS(paste0(sg_path,"weekly-patterns/v2/scc_patterns_2020-03-09_origins_daily.rds"))

Try looking at the absolute number of visits in each zip code in the week before SIP and the absolute number of cases to date in each county in SCC.

scc_visits_week_pre <- scc_patterns_origins_daily %>%
  left_join(scc_bg_zctas, by = c("origin_census_block_group" = "blockgroup")) %>%
  group_by(zipcode) %>%
  summarize(visit_counts_high_zip = sum(visit_counts_high), visit_counts_low_zip = sum(visit_counts_low)) %>%
  filter(!is.na(visit_counts_high_zip))

# combine with case data
scc_visits_week_pre_cases <- left_join(scc_visits_week_pre, scc_cases_zip) %>%
  filter(!is.na(cases))
  
# plot 
fig_scc_cases_visits_week_pre <- plot_ly(scc_visits_week_pre_cases) %>%
  add_trace(x = ~visit_counts_high_zip, y = ~cases, type = 'scatter', mode = 'markers') %>%
  add_trace(x = ~visit_counts_high_zip, y = fitted(lm(cases~ visit_counts_high_zip, scc_visits_week_pre_cases)), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = 'Number of visits in week before SIP (high)'), yaxis = list(title = 'Current cases'), title = "SCC, cases and visits in week before SIP")

fig_scc_cases_visits_week_pre
scc_model_cases_visits_week_pre <- lm(cases ~ visit_counts_high_zip, scc_visits_week_pre_cases)

summary(scc_model_cases_visits_week_pre)
## 
## Call:
## lm(formula = cases ~ visit_counts_high_zip, data = scc_visits_week_pre_cases)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -54.061 -20.230  -3.022   9.316 154.928 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           -9.622e+00  9.634e+00  -0.999    0.322    
## visit_counts_high_zip  1.788e-04  2.756e-05   6.486 3.03e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 36.01 on 53 degrees of freedom
## Multiple R-squared:  0.4425, Adjusted R-squared:  0.432 
## F-statistic: 42.06 on 1 and 53 DF,  p-value: 3.03e-08
# try with low just for comparison
fig_scc_cases_visits_week_pre_low <- plot_ly(scc_visits_week_pre_cases) %>%
  add_trace(x = ~visit_counts_low_zip, y = ~cases, type = 'scatter', mode = 'markers') %>%
  add_trace(x = ~visit_counts_low_zip, y = fitted(lm(cases~ visit_counts_low_zip, scc_visits_week_pre_cases)), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = 'Number of visits in week before SIP (low)'), yaxis = list(title = 'Current cases'), title = "SCC, cases and visits in week before SIP")

fig_scc_cases_visits_week_pre_low
scc_model_cases_visits_week_pre_low <- lm(cases ~ visit_counts_low_zip, scc_visits_week_pre_cases)

summary(scc_model_cases_visits_week_pre_low)
## 
## Call:
## lm(formula = cases ~ visit_counts_low_zip, data = scc_visits_week_pre_cases)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -51.581 -20.159  -5.278   6.761 159.506 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -6.725e+00  9.908e+00  -0.679      0.5    
## visit_counts_low_zip  2.542e-04  4.251e-05   5.981 1.94e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 37.27 on 53 degrees of freedom
## Multiple R-squared:  0.403,  Adjusted R-squared:  0.3917 
## F-statistic: 35.77 on 1 and 53 DF,  p-value: 1.942e-07

Cases per capita:

fig_scc_cases_pop_visits_week_pre <- plot_ly(scc_visits_week_pre_cases) %>%
  add_trace(x = ~visit_counts_high_zip, y = ~cases_by_pop, type = 'scatter', mode = 'markers') %>%
  add_trace(x = ~visit_counts_high_zip, y = fitted(lm(cases_by_pop~ visit_counts_high_zip, scc_visits_week_pre_cases)), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = 'Number of visits in week before SIP (high)'), yaxis = list(title = 'Current cases per capita'), title = "SCC, cases and visits in week before SIP")

fig_scc_cases_pop_visits_week_pre
scc_model_cases_pop_visits_week_pre <- lm(cases_by_pop ~ visit_counts_high_zip, scc_visits_week_pre_cases)

summary(scc_model_cases_pop_visits_week_pre)
## 
## Call:
## lm(formula = cases_by_pop ~ visit_counts_high_zip, data = scc_visits_week_pre_cases)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0010663 -0.0005828 -0.0001802  0.0004813  0.0035539 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)   
## (Intercept)           7.800e-04  2.374e-04   3.286  0.00181 **
## visit_counts_high_zip 1.230e-09  6.791e-10   1.812  0.07568 . 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0008872 on 53 degrees of freedom
## Multiple R-squared:  0.05833,    Adjusted R-squared:  0.04056 
## F-statistic: 3.283 on 1 and 53 DF,  p-value: 0.07568

Visits per capita

scc_visits_week_pre_cases <- scc_visits_week_pre_cases %>%
  mutate(visits_per_capita_high = visit_counts_high_zip / pop, visits_per_capita_low = visit_counts_low_zip / pop)

# plot
fig_scc_cases_pop_visits_pop_week_pre <- plot_ly(scc_visits_week_pre_cases) %>%
  add_trace(x = ~visits_per_capita_high, y = ~cases_by_pop, type = 'scatter', mode = 'markers') %>%
  add_trace(x = ~visits_per_capita_high, y = fitted(lm(cases_by_pop~ visits_per_capita_high, scc_visits_week_pre_cases)), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = 'Number of visits per capita in week before SIP (high)'), yaxis = list(title = 'Current cases per capita'), title = "SCC, cases and visits in week before SIP")

fig_scc_cases_pop_visits_pop_week_pre
scc_model_cases_pop_visits_pop_week_pre <- lm(cases_by_pop ~ visits_per_capita_high, scc_visits_week_pre_cases)

summary(scc_model_cases_pop_visits_pop_week_pre)
## 
## Call:
## lm(formula = cases_by_pop ~ visits_per_capita_high, data = scc_visits_week_pre_cases)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -1.611e-03 -5.623e-04 -9.265e-05  3.386e-04  2.869e-03 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             2.331e-03  6.408e-04   3.637 0.000625 ***
## visits_per_capita_high -1.344e-04  7.175e-05  -1.873 0.066582 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0008855 on 53 degrees of freedom
## Multiple R-squared:  0.06208,    Adjusted R-squared:  0.04439 
## F-statistic: 3.508 on 1 and 53 DF,  p-value: 0.06658

That’s very weird…

Also do multiple regression with absolute cases, absolute visits, and population

cases_pop_visits_scc <- lm(cases ~ visit_counts_high_zip + pop, scc_visits_week_pre_cases)

summary(cases_pop_visits_scc)
## 
## Call:
## lm(formula = cases ~ visit_counts_high_zip + pop, data = scc_visits_week_pre_cases)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -62.108 -18.294  -3.583  10.165 148.027 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)  
## (Intercept)           -1.430e+01  9.722e+00  -1.471   0.1473  
## visit_counts_high_zip -1.662e-05  1.062e-04  -0.156   0.8763  
## pop                    1.844e-03  9.700e-04   1.901   0.0628 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 35.15 on 52 degrees of freedom
## Multiple R-squared:  0.4787, Adjusted R-squared:  0.4587 
## F-statistic: 23.88 on 2 and 52 DF,  p-value: 4.408e-08

Comparing visits and % leaving home

Looking at the differences in visits and % leaving home for the week before SIP.

# save previous results as another name since I edit that name below
sfc_patterns_origins_daily_week_pre <- sfc_patterns_origins_daily

# get % leaving in week before SIP
sf_sd_zip_week_pre <- sf_sd_zip_by_date %>%
  filter(date < shelter_date & date >= (shelter_date - 7)) %>%
  group_by(zipcode) %>%
  summarize(total_at_home_zip = sum(total_at_home_zip), total_devices = sum(total_devices)) %>% 
  mutate(percent_at_home = total_at_home_zip*100/total_devices, percent_leaving_home = (100 - percent_at_home)) %>% 
  left_join(sf_pre_sd_leaving_avg %>% dplyr::select(zipcode, percent_leaving_home_pre)) %>%
  mutate(dif_percent_leaving = percent_leaving_home - percent_leaving_home_pre)

sf_sd_visits_week_pre <- left_join(sf_sd_zip_week_pre, sf_visits_week_pre_cases) %>% filter(!is.na(percent_leaving_home) & !is.na(visits_per_capita_high))

# plot
fig_sfc_visits_pop_leaving_week_pre <- plot_ly(sf_sd_visits_week_pre) %>%
  add_trace(x = ~visits_per_capita_high, y = ~percent_leaving_home, type = 'scatter', mode = 'markers') %>%
  add_trace(x = ~visits_per_capita_high, y = fitted(lm(percent_leaving_home~ visits_per_capita_high, sf_sd_visits_week_pre)), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = 'Number of visits per capita in week before SIP (high)'), yaxis = list(title = '% leaving home in week before SIP'), title = "SFC, leaving home and visits in week before SIP")

fig_sfc_visits_pop_leaving_week_pre

That’s odd, but makes sense why the two metrics give us different results.

What about total devices leaving and total visits?

sf_sd_visits_week_pre <- sf_sd_visits_week_pre %>% mutate(total_leaving_home_zip = total_devices - total_at_home_zip)

fig_sfc_visits_leaving_week_pre <- plot_ly(sf_sd_visits_week_pre) %>%
  add_trace(x = ~visit_counts_high_zip, y = ~total_leaving_home_zip, type = 'scatter', mode = 'markers') %>%
  add_trace(x = ~visit_counts_high_zip, y = fitted(lm(total_leaving_home_zip~ visit_counts_high_zip, sf_sd_visits_week_pre)), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = 'Number of visits in week before SIP (high)'), yaxis = list(title = 'Number of devices leaving home in week before SIP'), title = "SFC, leaving home and visits in week before SIP")

fig_sfc_visits_leaving_week_pre

Well that’s a relationship we would expect. Interesting.

Try SCC.

# get % leaving in week before SIP
scc_sd_zip_week_pre <- scc_sd_zip_by_date %>%
  filter(date < shelter_date & date >= (shelter_date - 7)) %>%
  group_by(zipcode) %>%
  summarize(total_at_home_zip = sum(total_at_home_zip), total_devices = sum(total_devices)) %>% 
  mutate(percent_at_home = total_at_home_zip*100/total_devices, percent_leaving_home = (100 - percent_at_home)) %>% 
  left_join(sf_pre_sd_leaving_avg %>% dplyr::select(zipcode, percent_leaving_home_pre)) %>%
  mutate(dif_percent_leaving = percent_leaving_home - percent_leaving_home_pre)

scc_sd_visits_week_pre <- left_join(scc_sd_zip_week_pre, scc_visits_week_pre_cases) %>% filter(!is.na(percent_leaving_home) & !is.na(visits_per_capita_high))

# plot
fig_scc_visits_pop_leaving_week_pre <- plot_ly(scc_sd_visits_week_pre) %>%
  add_trace(x = ~visits_per_capita_high, y = ~percent_leaving_home, type = 'scatter', mode = 'markers') %>%
  add_trace(x = ~visits_per_capita_high, y = fitted(lm(percent_leaving_home~ visits_per_capita_high, scc_sd_visits_week_pre)), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = 'Number of visits per capita in week before SIP (high)'), yaxis = list(title = '% leaving home in week before SIP'), title = "SCC, leaving home and visits in week before SIP")

fig_scc_visits_pop_leaving_week_pre

Remove outlier at bottom right that is barely leaving home and plot again

# plot without outliers
fig_scc_visits_pop_leaving_week_pre_edit <- plot_ly(scc_sd_visits_week_pre %>% filter(percent_leaving_home > 50)) %>%
  add_trace(x = ~visits_per_capita_high, y = ~percent_leaving_home, type = 'scatter', mode = 'markers') %>%
  add_trace(x = ~visits_per_capita_high, y = fitted(lm(percent_leaving_home~ visits_per_capita_high, scc_sd_visits_week_pre %>% filter(percent_leaving_home > 50))), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = 'Number of visits per capita in week before SIP (high)'), yaxis = list(title = '% leaving home in week before SIP'), title = "SCC, leaving home and visits in week before SIP")

fig_scc_visits_pop_leaving_week_pre_edit

Still not totally what we would expect (a linear relationship).

Total devices leaving and total visits.

scc_sd_visits_week_pre <- scc_sd_visits_week_pre %>% mutate(total_leaving_home_zip = total_devices - total_at_home_zip)

fig_scc_visits_leaving_week_pre <- plot_ly(scc_sd_visits_week_pre) %>%
  add_trace(x = ~visit_counts_high_zip, y = ~total_leaving_home_zip, type = 'scatter', mode = 'markers') %>%
  add_trace(x = ~visit_counts_high_zip, y = fitted(lm(total_leaving_home_zip~ visit_counts_high_zip, scc_sd_visits_week_pre)), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = 'Number of visits in week before SIP (high)'), yaxis = list(title = 'Number of devices leaving home in week before SIP'), title = "SCC, leaving home and visits in week before SIP")

fig_scc_visits_leaving_week_pre

Also expected. Interesting that these metrics diverge.

Would it be more appropriate to normalize number of visits by number of devices rather than population, then?

Preparing more data

## San Francisco
# for (i in 1:9) {
# week_init <- as.Date("2020-03-09")
# week <- week_init + 7*i
# filter_loc <- "San Francisco"
# filter_loc_abr <- "sfc"
# 
# patterns <- 
#   readRDS(paste0(sg_path,'weekly-patterns/v2/main-file/',week,'-weekly-patterns-ca.rds'))
# 
# hps <- 
#   readRDS(paste0(sg_path,'weekly-patterns/v2/home-summary-file/',week,'-home-panel-summary.rds'))
# 
# sfc_patterns <-
#   patterns %>% 
#   left_join(
#     poi_ca %>%
#       dplyr::select(
#         safegraph_place_id,
#         longitude,
#         latitude,
#         naics_code
#       ), 
#     by = "safegraph_place_id"
#   ) %>% 
#   filter(!is.na(longitude)) %>% 
#   mutate(
#     long = longitude,
#     lat = latitude
#   ) %>% 
#   st_as_sf(coords = c("long","lat")) %>% 
#   st_set_crs(4326) %>% 
#   st_join(
#     counties("CA", cb=F, progress_bar=F) %>% 
#       filter(NAME == filter_loc) %>% 
#       st_transform(4326) %>% 
#       dplyr::select(NAME),
#     left=F
#   )
# 
# sfc_patterns_origins_daily <-
#   process_patterns_origins_daily(sfc_patterns,hps)
# 
# sfc_patterns_origins_daily <-
#   sfc_patterns_origins_daily %>% 
#   left_join(
#     sfc_patterns %>% 
#       as.data.frame() %>% 
#       dplyr::select(
#         safegraph_place_id,
#         location_name,
#         street_address,
#         naics_code
#       )
#   )
# 
# saveRDS(sfc_patterns_origins_daily,paste0(sg_path,"weekly-patterns/v2/",filter_loc_abr,"_patterns_",week,"_origins_daily.rds"))
# }

Total visits since SIP, SFC

Let’s look at total visits since SIP.

sfc_patterns_origins_daily_week1 <- readRDS(paste0(sg_path,"weekly-patterns/v2/sfc_patterns_2020-03-16_origins_daily.rds"))

sfc_patterns_origins_daily_week2 <- readRDS(paste0(sg_path,"weekly-patterns/v2/sfc_patterns_2020-03-23_origins_daily.rds"))

sfc_patterns_origins_daily_week3 <- readRDS(paste0(sg_path,"weekly-patterns/v2/sfc_patterns_2020-03-30_origins_daily.rds"))

sfc_patterns_origins_daily_week4 <- readRDS(paste0(sg_path,"weekly-patterns/v2/sfc_patterns_2020-04-06_origins_daily.rds"))

sfc_patterns_origins_daily_week5 <- readRDS(paste0(sg_path,"weekly-patterns/v2/sfc_patterns_2020-04-13_origins_daily.rds"))

sfc_patterns_origins_daily_week6 <- readRDS(paste0(sg_path,"weekly-patterns/v2/sfc_patterns_2020-04-20_origins_daily.rds"))

sfc_patterns_origins_daily_week7 <- readRDS(paste0(sg_path,"weekly-patterns/v2/sfc_patterns_2020-04-27_origins_daily.rds"))

sfc_patterns_origins_daily_week8 <- readRDS(paste0(sg_path,"weekly-patterns/v2/sfc_patterns_2020-05-04_origins_daily.rds"))

sfc_patterns_origins_daily_week9 <- readRDS(paste0(sg_path,"weekly-patterns/v2/sfc_patterns_2020-05-11_origins_daily.rds"))

# combine
sfc_patterns_origins_daily_since_SIP <- rbind(sfc_patterns_origins_daily_week1, sfc_patterns_origins_daily_week2,sfc_patterns_origins_daily_week3,sfc_patterns_origins_daily_week4,sfc_patterns_origins_daily_week5,sfc_patterns_origins_daily_week6,sfc_patterns_origins_daily_week7,sfc_patterns_origins_daily_week8,sfc_patterns_origins_daily_week9)

# summarize
sf_visits_since_SIP <- sfc_patterns_origins_daily_since_SIP %>%
  left_join(sf_bg_zctas, by = c("origin_census_block_group" = "blockgroup")) %>%
  group_by(zipcode) %>%
  summarize(visit_counts_high_zip = sum(visit_counts_high), visit_counts_low_zip = sum(visit_counts_low)) %>%
  filter(!is.na(visit_counts_high_zip))

sf_visits_since_SIP_cases <- left_join(sf_visits_since_SIP, sf_cases_zip %>% filter(date == max(date)))
  
# plot 
fig_sf_cases_visits_since_SIP <- plot_ly(sf_visits_since_SIP_cases) %>%
  add_trace(x = ~visit_counts_high_zip, y = ~confirmed_cases, type = 'scatter', mode = 'markers') %>%
  add_trace(x = ~visit_counts_high_zip, y = fitted(lm(confirmed_cases~ visit_counts_high_zip, sf_visits_since_SIP_cases)), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = 'Number of visits since SIP (high)'), yaxis = list(title = 'Current cases'), title = "SFC, cases and visits since SIP")

fig_sf_cases_visits_since_SIP
model_cases_visits_since_SIP <- lm(confirmed_cases ~ visit_counts_high_zip, sf_visits_since_SIP_cases)

summary(model_cases_visits_since_SIP)
## 
## Call:
## lm(formula = confirmed_cases ~ visit_counts_high_zip, data = sf_visits_since_SIP_cases)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -125.419  -30.677   -7.659   40.505  162.880 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           -1.394e+01  2.442e+01  -0.571    0.574    
## visit_counts_high_zip  9.686e-05  1.996e-05   4.852 6.73e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 66.77 on 23 degrees of freedom
## Multiple R-squared:  0.5059, Adjusted R-squared:  0.4844 
## F-statistic: 23.55 on 1 and 23 DF,  p-value: 6.729e-05
# try with low just for comparison
fig_sf_cases_visits_since_SIP_low <- plot_ly(sf_visits_since_SIP_cases) %>%
  add_trace(x = ~visit_counts_low_zip, y = ~confirmed_cases, type = 'scatter', mode = 'markers') %>%
  add_trace(x = ~visit_counts_low_zip, y = fitted(lm(confirmed_cases~ visit_counts_low_zip, sf_visits_since_SIP_cases)), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = 'Number of visits since SIP (low)'), yaxis = list(title = 'Current cases'), title = "SFC, cases and visits since SIP")

fig_sf_cases_visits_since_SIP_low
model_cases_visits_since_SIP_low <- lm(confirmed_cases ~ visit_counts_low_zip, sf_visits_since_SIP_cases)

summary(model_cases_visits_since_SIP_low)
## 
## Call:
## lm(formula = confirmed_cases ~ visit_counts_low_zip, data = sf_visits_since_SIP_cases)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -123.817  -29.666   -5.974   38.802  162.414 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -1.578e+01  2.462e+01  -0.641    0.528    
## visit_counts_low_zip  1.499e-04  3.072e-05   4.879 6.29e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 66.58 on 23 degrees of freedom
## Multiple R-squared:  0.5086, Adjusted R-squared:  0.4873 
## F-statistic: 23.81 on 1 and 23 DF,  p-value: 6.294e-05

Looks pretty similar to the week before SIP figure.

What about just the week after/around SIP? (3/16-3/22)

sf_visits_week1 <- sfc_patterns_origins_daily_week1 %>%
  left_join(sf_bg_zctas, by = c("origin_census_block_group" = "blockgroup")) %>%
  group_by(zipcode) %>%
  summarize(visit_counts_high_zip = sum(visit_counts_high), visit_counts_low_zip = sum(visit_counts_low)) %>%
  filter(!is.na(visit_counts_high_zip))

sf_visits_week1_cases <- left_join(sf_visits_week1, sf_cases_zip %>% filter(date == max(date)))
  
# plot 
fig_sf_cases_visits_week1 <- plot_ly(sf_visits_week1_cases) %>%
  add_trace(x = ~visit_counts_high_zip, y = ~confirmed_cases, type = 'scatter', mode = 'markers') %>%
  add_trace(x = ~visit_counts_high_zip, y = fitted(lm(confirmed_cases~ visit_counts_high_zip, sf_visits_week1_cases)), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = 'Number of visits since SIP (high)'), yaxis = list(title = 'Current cases'), title = "SFC, cases and visits in week after SIP ( 3/16-3/22)")

fig_sf_cases_visits_week1
model_cases_visits_week1 <- lm(confirmed_cases ~ visit_counts_high_zip, sf_visits_week1_cases)

summary(model_cases_visits_week1)
## 
## Call:
## lm(formula = confirmed_cases ~ visit_counts_high_zip, data = sf_visits_week1_cases)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -119.730  -35.372    1.163   39.723  166.097 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           -1.457e+01  2.460e+01  -0.592    0.559    
## visit_counts_high_zip  8.286e-04  1.713e-04   4.837    7e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 66.88 on 23 degrees of freedom
## Multiple R-squared:  0.5042, Adjusted R-squared:  0.4827 
## F-statistic: 23.39 on 1 and 23 DF,  p-value: 6.997e-05

Note that my “1 week post” actually starts 3/16 instead of 3/17, but that could be adjusted, I just kept it like this now since it was easier since that’s already how the data was bucketed. Could easily adjust this later though.